@@ -1,6 +1,6 @@
 // import pdf from 'pdf-parse';
 // import fs from 'fs';
-import { CloudApi } from 'src/lib/ncloud';
+import { CloudApi, CloudObject, CloudQuery } from 'src/lib/ncloud';
 import mammoth from "mammoth";
 import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
 import { Document } from '@langchain/core/documents';
@@ -14,12 +14,47 @@ import { TensorFlowEmbeddings } from "@langchain/community/embeddings/tensorflow
 
 export class AgentStory{
 
-    url:string = ""
-    content:string = ""
-    docList:Array<Document> = []
-    constructor(){
+    story:CloudObject|undefined
+    // Document title
+    title:string|undefined = ""
+    // Document tags
+    tags:Array<string>|undefined
+    // Source URL of the file
+    url:string|undefined = ""
+    // Full plain-text content of the document
+    content:string|undefined = ""
+    // Unique hash of the document
+    hash:string|undefined = ""
+    // List of chunks after the document is split
+    docList:Array<Document|any> = []
+
+    constructor(metadata:{
+        url:string,
+        title?:string,
+        tags?:Array<string>
+    }){
+        this.url = metadata.url
+        this.title = metadata.title
+        this.tags = metadata.tags
         setBackend()
     }
+    async save(){
+        if(!this.hash){ return }
+        let query = new CloudQuery("Story");
+        query.equalTo("hash",this.hash);
+        let story = await query.first();
+        if(!story?.id){
+            story = new CloudObject("Story");
+        }
+        story.set({
+            title: this.title,
+            url: this.url,
+            content: this.content,
+            hash: this.hash,
+            tags: this.tags
+        })
+        this.story = await story.save();
+    }
     async loader(url:string){
         let api = new CloudApi();
 
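// Editor's sketch (not part of the diff): minimal usage of the new constructor and the
// hash-based upsert. The import path "./agentStory" and the sample URL are hypothetical
// assumptions; "Story" is the cloud class that save() queries above.
import { AgentStory } from "./agentStory";

async function demo() {
    const url = "https://example.com/sample.docx"; // hypothetical .docx source
    const story = new AgentStory({ url, title: "Sample story", tags: ["demo"] });
    // loader() extracts the text and computes this.hash; save() then looks the
    // Story up by hash and only creates a new CloudObject on a miss.
    const content = await story.loader(url);
    console.log(content?.length);
}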
@@ -34,6 +69,7 @@ export class AgentStory{
         if(this.content){
             this.url = url
         }
+        this.save();
         return this.content
     }
 
@@ -49,8 +85,12 @@ export class AgentStory{
         } catch (err) {
             console.error(err);
         }
+
+        this.hash = await arrayBufferToHASH(arrayBuffer)
+
         // let html = mammoth.convertToHtml(buffer)
         data = text?.value || "";
+        // Match every run of consecutive \n characters and replace it with a single newline
         data = data.replaceAll(/\n+/g,"\n") // strip redundant line breaks
         return {data}
     }
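// Editor's note (not part of the diff): the hash above is computed from the raw
// ArrayBuffer, so it identifies the source file rather than the cleaned text. The
// /\n+/g regex then collapses each run of newlines into a single one, e.g.:
//   "a\n\n\nb\nc".replaceAll(/\n+/g, "\n")  // => "a\nb\nc"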
@@ -62,7 +102,7 @@ export class AgentStory{
         // Default: recursive character text splitter
         let splitter = new RecursiveCharacterTextSplitter({
             chunkSize: options?.chunkSize || 500,
-            chunkOverlap: options?.chunkOverlap || 100,
+            chunkOverlap: options?.chunkOverlap || 150,
         });
 
         let docOutput = await splitter.splitDocuments([
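// Editor's note (not part of the diff): with the defaults above, each ~500-character
// chunk repeats up to 150 characters of its predecessor, so text cut at a chunk
// boundary still appears whole in the following chunk; raising the overlap from 100
// to 150 trades extra storage and embedding work for better retrieval continuity.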
@@ -79,14 +119,51 @@ export class AgentStory{
      * https://js.langchain.com/docs/integrations/text_embedding/tensorflow/
      * @returns 
      */
-    //  embedding vector(1536) NOT NULL -- NOTE: 1536 for ChatGPT
+    //  TensorFlow embedding vector(512) NOT NULL -- NOTE: 512 for TensorFlow
+    //  OpenAI embedding vector(1536) NOT NULL -- NOTE: 1536 for ChatGPT
     async embedings(){
         if(!this.docList?.length){return}
         const embeddings = new TensorFlowEmbeddings();
         let documentRes = await embeddings.embedDocuments(this.docList?.map(item=>item.pageContent));
         console.log(documentRes);
+
+        // Persist the vectors
+        documentRes.forEach(async (vector512:any,index)=>{
+            /**
+             * metadata
+             * pageContent
+             */
+            let document = this.docList[index]
+            this.docList[index].vector512 = vector512
+            let hash = await arrayBufferToHASH(stringToArrayBuffer(document?.pageContent))
+            let query = new CloudQuery("Document");
+            query.equalTo("hash",hash);
+            let docObj = await query.first()
+            if(!docObj?.id){
+                docObj = new CloudObject("Document");
+            }
+            docObj.set({
+                metadata:document?.metadata,
+                pageContent:document?.pageContent,
+                vector512:vector512,
+                hash:hash,
+                story:this.story?.toPointer(),
+            })
+            docObj.save();
+        })
         return documentRes;
     }
+    async destoryAllDocument(){
+        if(this.story?.id){
+            let query = new CloudQuery("Document");
+            query.equalTo("story",this.story?.id);
+            let docList = await query.find();
+            docList.forEach(doc=>{
+                doc.destroy();
+            })
+        }
+
+    }
 }
 
 export async function fetchFileBuffer(url: string): Promise<Buffer> {
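// Editor's note (not part of the diff): forEach does not await async callbacks, so
// embedings() resolves before the per-chunk saves above have finished (and errors in
// them go unhandled). A sketch of an awaitable variant over the same documentRes,
// assuming the same CloudQuery/CloudObject API:
//
// await Promise.all(documentRes.map(async (vector512, index) => {
//     // ...same hash lookup-or-create logic as above...
//     await docObj.save();
// }));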
@@ -132,4 +209,54 @@ async function setBackend(){
         backend&&await tf.setBackend(backend);
         await tf.ready();
         return
-  }
+  }
+
+  export async function arrayBufferToHASH(arrayBuffer:any) {
+    // Compute the digest with the SubtleCrypto API
+    const hashBuffer = await crypto.subtle.digest('SHA-256', arrayBuffer); // SHA-256 rather than MD5
+    const hashArray = Array.from(new Uint8Array(hashBuffer)); // convert the buffer to a byte array
+    const hashHex = hashArray.map(b => ('00' + b.toString(16)).slice(-2)).join(''); // convert to a hex string
+    return hashHex;
+}
+export function stringToArrayBuffer(str:string) {
+    // Encode the string into a UTF-8 byte buffer
+    const encoder = new TextEncoder();
+    return encoder.encode(str).buffer;
+}
+export async function EmbedQuery(str:any):Promise<Array<number>>{
+    const embeddings = new TensorFlowEmbeddings();
+    let documentRes = await embeddings.embedQuery(str);
+    return documentRes
+}
+
+/** Score documents by cosine similarity against a query vector */
+export function RetriveAllDocument(vector1: Array<number>, docList: Array<any>): Array<any> {
+    docList.forEach(doc => {
+        const vector512 = doc.vector512;
+        doc.similarity = cosineSimilarity(vector1, vector512); // compute and store the cosine similarity
+    });
+
+    // Sort by similarity, descending
+    docList.sort((a, b) => b.similarity - a.similarity);
+
+    return docList; // return the sorted docList
+}
+function dotProduct(vectorA: number[], vectorB: number[]): number {
+    return vectorA.reduce((sum, value, index) => sum + value * vectorB[index], 0);
+}
+
+function magnitude(vector: number[]): number {
+    return Math.sqrt(vector.reduce((sum, value) => sum + value * value, 0));
+}
+
+function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
+    const dotProd = dotProduct(vectorA, vectorB);
+    const magnitudeA = magnitude(vectorA);
+    const magnitudeB = magnitude(vectorB);
+
+    if (magnitudeA === 0 || magnitudeB === 0) {
+        throw new Error("One or both vectors are zero vectors, cannot compute cosine similarity.");
+    }
+
+    return dotProd / (magnitudeA * magnitudeB);
+}
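// Editor's sketch (not part of the diff): end-to-end retrieval over the persisted
// chunks. The import path "./agentStory" is a hypothetical assumption; CloudQuery is
// the client used above, and vector512 is the field embedings() stores. It also
// assumes query results expose stored fields as properties, as RetriveAllDocument expects.
import { EmbedQuery, RetriveAllDocument } from "./agentStory";
import { CloudQuery } from "src/lib/ncloud";

async function search(question: string): Promise<Array<any>> {
    const queryVector = await EmbedQuery(question);        // 512-dim TensorFlow embedding
    const docs = await new CloudQuery("Document").find();  // chunks persisted by embedings()
    const ranked = RetriveAllDocument(queryVector, docs);  // score and sort by cosine similarity
    return ranked.slice(0, 3);                             // top 3 most similar chunks
}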