|
@@ -1,6 +1,6 @@
|
|
|
// import pdf from 'pdf-parse';
|
|
|
// import fs from 'fs';
|
|
|
-import { CloudApi } from 'src/lib/ncloud';
|
|
|
+import { CloudApi, CloudObject, CloudQuery } from 'src/lib/ncloud';
|
|
|
import mammoth from "mammoth";
|
|
|
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
|
|
import { Document } from '@langchain/core/documents';
|
|
@@ -14,12 +14,47 @@ import { TensorFlowEmbeddings } from "@langchain/community/embeddings/tensorflow
|
|
|
|
|
|
export class AgentStory{
|
|
|
|
|
|
- url:string = ""
|
|
|
- content:string = ""
|
|
|
- docList:Array<Document> = []
|
|
|
- constructor(){
|
|
|
+ story:CloudObject|undefined
|
|
|
+ // 文档标题
|
|
|
+ title:string|undefined = ""
|
|
|
+ // 文档标签
|
|
|
+ tags:Array<string>|undefined
|
|
|
+ // 文件源地址
|
|
|
+ url:string|undefined = ""
|
|
|
+ // 文档完整纯文本内容
|
|
|
+ content:string|undefined = ""
|
|
|
+ // 文档hash唯一值
|
|
|
+ hash:string|undefined = ""
|
|
|
+ // 文档分割后的列表
|
|
|
+ docList:Array<Document|any> = []
|
|
|
+
|
|
|
+ constructor(metadata:{
|
|
|
+ url:string,
|
|
|
+ title?:string,
|
|
|
+ tags?:Array<string>
|
|
|
+ }){
|
|
|
+ this.url = metadata.url
|
|
|
+ this.title = metadata.title
|
|
|
+ this.tags = metadata.tags
|
|
|
setBackend()
|
|
|
}
|
|
|
+ async save(){
|
|
|
+ if(!this.hash){ return }
|
|
|
+ let query = new CloudQuery("Story");
|
|
|
+ query.equalTo("hash",this.hash);
|
|
|
+ let story = await query.first();
|
|
|
+ if(!story?.id){
|
|
|
+ story = new CloudObject("Story");
|
|
|
+ }
|
|
|
+ story.set({
|
|
|
+ title: this.title,
|
|
|
+ url: this.url,
|
|
|
+ content: this.content,
|
|
|
+ hash: this.hash,
|
|
|
+ tags:this.tags
|
|
|
+ })
|
|
|
+ this.story = await story.save();
|
|
|
+ }
|
|
|
async loader(url:string){
|
|
|
let api = new CloudApi();
|
|
|
|
|
@@ -34,6 +69,7 @@ export class AgentStory{
|
|
|
if(this.content){
|
|
|
this.url = url
|
|
|
}
|
|
|
+ this.save();
|
|
|
return this.content
|
|
|
}
|
|
|
|
|
@@ -49,8 +85,12 @@ export class AgentStory{
|
|
|
} catch (err) {
|
|
|
console.error(err);
|
|
|
}
|
|
|
+
|
|
|
+ this.hash = await arrayBufferToHASH(arrayBuffer)
|
|
|
+
|
|
|
// let html = mammoth.convertToHtml(buffer)
|
|
|
data = text?.value || "";
|
|
|
+ // 正则匹配连续多个换行符(\n+),合并为一个换行符
|
|
|
data = data.replaceAll(/\n+/g,"\n") // 剔除多余换行
|
|
|
return {data}
|
|
|
}
|
|
@@ -62,7 +102,7 @@ export class AgentStory{
|
|
|
// 默认:递归字符文本分割器
|
|
|
let splitter = new RecursiveCharacterTextSplitter({
|
|
|
chunkSize: options?.chunkSize || 500,
|
|
|
- chunkOverlap: options?.chunkOverlap || 100,
|
|
|
+ chunkOverlap: options?.chunkOverlap || 150,
|
|
|
});
|
|
|
|
|
|
let docOutput = await splitter.splitDocuments([
|
|
@@ -79,14 +119,51 @@ export class AgentStory{
|
|
|
* https://js.langchain.com/docs/integrations/text_embedding/tensorflow/
|
|
|
* @returns
|
|
|
*/
|
|
|
- // embedding vector(1536) NOT NULL -- NOTE: 1536 for ChatGPT
|
|
|
+ // TensorFlow embedding vector(512) NOT NULL -- NOTE: 512 for Tensorflow
|
|
|
+ // OpenAI embedding vector(1536) NOT NULL -- NOTE: 1536 for ChatGPT
|
|
|
async embedings(){
|
|
|
if(!this.docList?.length){return}
|
|
|
const embeddings = new TensorFlowEmbeddings();
|
|
|
let documentRes = await embeddings.embedDocuments(this.docList?.map(item=>item.pageContent));
|
|
|
console.log(documentRes);
|
|
|
+
|
|
|
+ // 向量持久化
|
|
|
+ documentRes.forEach(async (vector512:any,index)=>{
|
|
|
+ /**
|
|
|
+ * metadata
|
|
|
+ * pageContent
|
|
|
+ */
|
|
|
+ let document = this.docList[index]
|
|
|
+ this.docList[index].vector512 = vector512
|
|
|
+ let hash = await arrayBufferToHASH(stringToArrayBuffer(document?.pageContent))
|
|
|
+ let query = new CloudQuery("Document");
|
|
|
+ query.equalTo("hash",hash);
|
|
|
+ let docObj = await query.first()
|
|
|
+ if(!docObj?.id){
|
|
|
+ docObj = new CloudObject("Document");
|
|
|
+ }
|
|
|
+ docObj.set({
|
|
|
+ metadata:document?.metadata,
|
|
|
+ pageContent:document?.pageContent,
|
|
|
+ vector512:vector512,
|
|
|
+ hash:hash,
|
|
|
+ story:this.story?.toPointer(),
|
|
|
+ })
|
|
|
+ docObj.save();
|
|
|
+ })
|
|
|
return documentRes;
|
|
|
}
|
|
|
+ async destroyAllDocument(){
|
|
|
+ if(this.story?.id){
|
|
|
+ let query = new CloudQuery("Document");
|
|
|
+ query.equalTo("story",this.story?.id);
|
|
|
+ let docList = await query.find();
|
|
|
+ docList.forEach(doc=>{
|
|
|
+ doc.destroy();
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
export async function fetchFileBuffer(url: string): Promise<Buffer> {
|
|
@@ -132,4 +209,54 @@ async function setBackend(){
|
|
|
backend&&await tf.setBackend(backend);
|
|
|
await tf.ready();
|
|
|
return
|
|
|
- }
|
|
|
+ }
|
|
|
+
|
|
|
+ export async function arrayBufferToHASH(arrayBuffer:any) {
|
|
|
+ // 使用 SubtleCrypto API 计算哈希
|
|
|
+ const hashBuffer = await crypto.subtle.digest('SHA-256', arrayBuffer); // 使用 SHA-256 代替 MD5
|
|
|
+ const hashArray = Array.from(new Uint8Array(hashBuffer)); // 将缓冲区转换为字节数组
|
|
|
+ const hashHex = hashArray.map(b => ('00' + b.toString(16)).slice(-2)).join(''); // 转换为十六进制字符串
|
|
|
+ return hashHex;
|
|
|
+}
|
|
|
+export function stringToArrayBuffer(str:string) {
|
|
|
+ // 将字符串按 UTF-8 编码为字节数组(注意:字节数可能不等于字符数)
|
|
|
+ const encoder = new TextEncoder();
|
|
|
+ return encoder.encode(str).buffer;
|
|
|
+}
|
|
|
+export async function EmbedQuery(str:any):Promise<Array<number>>{
|
|
|
+ const embeddings = new TensorFlowEmbeddings();
|
|
|
+ let documentRes = await embeddings.embedQuery(str);
|
|
|
+ return documentRes
|
|
|
+}
|
|
|
+
|
|
|
+/** 向量余弦相似度计算 */
|
|
|
+export function RetriveAllDocument(vector1: Array<number>, docList: Array<any>): Array<any> {
|
|
|
+ docList.forEach(doc => {
|
|
|
+ const vector512 = doc.vector512;
|
|
|
+ doc.similarity = cosineSimilarity(vector1, vector512); // 计算余弦相似度并存储
|
|
|
+ });
|
|
|
+
|
|
|
+ // 按照相似度排序,降序排列
|
|
|
+ docList.sort((a, b) => b.similarity - a.similarity);
|
|
|
+
|
|
|
+ return docList; // 返回排序后的docList
|
|
|
+}
|
|
|
+function dotProduct(vectorA: number[], vectorB: number[]): number {
|
|
|
+ return vectorA.reduce((sum, value, index) => sum + value * vectorB[index], 0);
|
|
|
+}
|
|
|
+
|
|
|
+function magnitude(vector: number[]): number {
|
|
|
+ return Math.sqrt(vector.reduce((sum, value) => sum + value * value, 0));
|
|
|
+}
|
|
|
+
|
|
|
+function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
|
|
|
+ const dotProd = dotProduct(vectorA, vectorB);
|
|
|
+ const magnitudeA = magnitude(vectorA);
|
|
|
+ const magnitudeB = magnitude(vectorB);
|
|
|
+
|
|
|
+ if (magnitudeA === 0 || magnitudeB === 0) {
|
|
|
+ throw new Error("One or both vectors are zero vectors, cannot compute cosine similarity.");
|
|
|
+ }
|
|
|
+
|
|
|
+ return dotProd / (magnitudeA * magnitudeB);
|
|
|
+}
|