- // import pdf from 'pdf-parse';
- // import fs from 'fs';
- import { CloudApi, CloudObject, CloudQuery } from 'src/lib/ncloud';
- import mammoth from "mammoth";
- import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
- import { Document } from '@langchain/core/documents';
- import * as tf from '@tensorflow/tfjs-core';
- // import "@tensorflow/tfjs-backend-cpu";
- // import '@tensorflow/tfjs-backend-webgpu';
- import '@tensorflow/tfjs-backend-webgl';
- // import '@tensorflow/tfjs-backend-wasm';
- import { TensorFlowEmbeddings } from "@langchain/community/embeddings/tensorflow";
- export class AgentStory{
- story:CloudObject|undefined
- // 文件标题
- title:string|undefined = ""
- // 文档标签
- tags:Array<string>|undefined
- // 文件源地址
- url:string|undefined = ""
- // 文档完整纯文本内容
- content:string|undefined = ""
- // 文档hash唯一值
- hash:string|undefined = ""
- // 文档分割后的列表
- docList:Array<Document|any> = []
- constructor(metadata:{
- url:string,
- title?:string,
- tags?:Array<string>
- }){
- this.url = metadata.url
- this.title = metadata.title
- this.tags = metadata.tags
- setBackend()
- }
- async save(){
- if(!this.hash){ return }
- let query = new CloudQuery("Story");
- query.equalTo("hash",this.hash);
- let story = await query.first();
- if(!story?.id){
- story = new CloudObject("Story");
- }
- story.set({
- title: this.title,
- url: this.url,
- content: this.content,
- hash: this.hash,
- tags:this.tags
- })
- this.story = await story.save();
- }
- async loader(url:string){
- let api = new CloudApi();
- let result;
- if(url?.endsWith(".docx")){
- result = await this.loadDocx(url)
- }
- if(!result){
- result = await api.fetch("agent/loader",{url:url})
- }
- this.content = result?.data || null
- if(this.content){
- this.url = url
- }
- this.save();
- return this.content
- }
- async loadDocx(url:string){
- let data:any
- const response = await fetch(url);
- const arrayBuffer:any = await response.arrayBuffer();
-
- let text;
- try {
- text = await mammoth.extractRawText({arrayBuffer:arrayBuffer}); // 浏览器 直接传递 arrayBuffer
- } catch (err) {
- console.error(err);
- }
- this.hash = await arrayBufferToHASH(arrayBuffer)
- // let html = mammoth.convertToHtml(buffer)
- data = text?.value || "";
- // 正则匹配所有 多个\n换行的字符 替换成一次换行
- data = data.replaceAll(/\n+/g,"\n") // 剔除多余换行
- return {data}
- }
- async splitter(options?:{
- chunkSize:number,
- chunkOverlap:number
- }){
- if(!this.content) return
- // 默认:递归字符文本分割器
- let splitter = new RecursiveCharacterTextSplitter({
- chunkSize: options?.chunkSize || 500,
- chunkOverlap: options?.chunkOverlap || 150,
- });
-
- let docOutput = await splitter.splitDocuments([
- new Document({ pageContent: this.content }),
- ]);
- console.log(docOutput)
- this.docList = docOutput
- return this.docList
- }
- /**
- * 文本向量提取
- * @see
- * https://js.langchain.com/docs/integrations/text_embedding/tensorflow/
- * @returns
- */
- // TensorFlow embedding vector(512) NOT NULL -- NOTE: 512 for Tensorflow
- // OpenAI embedding vector(1536) NOT NULL -- NOTE: 1536 for ChatGPT
- async embedings(){
- if(!this.docList?.length){return}
- const embeddings = new TensorFlowEmbeddings();
- let documentRes = await embeddings.embedDocuments(this.docList?.map(item=>item.pageContent));
- console.log(documentRes);
- // 向量持久化
- documentRes.forEach(async (vector512:any,index)=>{
- /**
- * metadata
- * pageContent
- */
- let document = this.docList[index]
- this.docList[index].vector512 = vector512
- let hash = await arrayBufferToHASH(stringToArrayBuffer(document?.pageContent))
- let query = new CloudQuery("Document");
- query.equalTo("hash",hash);
- let docObj = await query.first()
- if(!docObj?.id){
- docObj = new CloudObject("Document");
- }
- docObj.set({
- metadata:document?.metadata,
- pageContent:document?.pageContent,
- vector512:vector512,
- hash:hash,
- story:this.story?.toPointer(),
- })
- docObj.save();
- })
- return documentRes;
- }
- async destoryAllDocument(){
- if(this.story?.id){
- let query = new CloudQuery("Document");
- query.equalTo("story",this.story?.id);
- let docList = await query.find();
- docList.forEach(doc=>{
- doc.destroy();
- })
- }
-
- }
- }
- export async function fetchFileBuffer(url: string): Promise<Buffer> {
- const response = await fetch(url);
- if (!response.ok) {
- throw new Error(`Failed to fetch file: ${response.status} ${response.statusText}`);
- }
- const arrayBuffer = await response.arrayBuffer();
- return Buffer.from(arrayBuffer);
- }
- async function setBackend(){
- let backend
- let WebGPU = (navigator as any).gpu
- if (WebGPU) {
- // WebGPU is supported
- // console.log(WebGPU)
- backend = "webgpu"
- } else {
- // WebGPU is not supported
- }
- let glcanvas = document.createElement('canvas');
- let WebGL = glcanvas.getContext('webgl') || glcanvas.getContext('experimental-webgl');
- if (WebGL) {
- // console.log(WebGL)
- // WebGL is supported
- if(!backend) backend = "webgl"
- } else {
- // WebGL is not supported
- }
- if (typeof WebAssembly === 'object' && typeof WebAssembly.instantiate === 'function') {
- // WebAssembly is supported
- // console.log(WebAssembly)
- if(!backend) backend = "wasm"
- } else {
- // WebAssembly is not supported
- }
- backend&&await tf.setBackend(backend);
- await tf.ready();
- return
- }
- export async function arrayBufferToHASH(arrayBuffer:any) {
- // 使用 SubtleCrypto API 计算哈希
- const hashBuffer = await crypto.subtle.digest('SHA-256', arrayBuffer); // 使用 SHA-256 代替 MD5
- const hashArray = Array.from(new Uint8Array(hashBuffer)); // 将缓冲区转换为字节数组
- const hashHex = hashArray.map(b => ('00' + b.toString(16)).slice(-2)).join(''); // 转换为十六进制字符串
- return hashHex;
- }
- export function stringToArrayBuffer(str:string) {
- // 创建一个与字符串长度相同的Uint8Array
- const encoder = new TextEncoder();
- return encoder.encode(str).buffer;
- }
- export async function EmbedQuery(str:any):Promise<Array<number>>{
- const embeddings = new TensorFlowEmbeddings();
- let documentRes = await embeddings.embedQuery(str);
- return documentRes
- }
- /** 向量余弦相似度计算 */
- export function RetriveAllDocument(vector1: Array<number>, docList: Array<any>): Array<any> {
- docList.forEach(doc => {
- const vector512 = doc.vector512;
- doc.similarity = cosineSimilarity(vector1, vector512); // 计算余弦相似度并存储
- });
- // 按照相似度排序,降序排列
- docList.sort((a, b) => b.similarity - a.similarity);
- return docList; // 返回排序后的docList
- }
- function dotProduct(vectorA: number[], vectorB: number[]): number {
- return vectorA.reduce((sum, value, index) => sum + value * vectorB[index], 0);
- }
- function magnitude(vector: number[]): number {
- return Math.sqrt(vector.reduce((sum, value) => sum + value * value, 0));
- }
- function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
- const dotProd = dotProduct(vectorA, vectorB);
- const magnitudeA = magnitude(vectorA);
- const magnitudeB = magnitude(vectorB);
- if (magnitudeA === 0 || magnitudeB === 0) {
- throw new Error("One or both vectors are zero vectors, cannot compute cosine similarity.");
- }
- return dotProd / (magnitudeA * magnitudeB);
- }