story.ts 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. // import pdf from 'pdf-parse';
  2. // import fs from 'fs';
  3. import { CloudApi, CloudObject, CloudQuery } from 'src/lib/ncloud';
  4. import mammoth from "mammoth";
  5. import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
  6. import { Document } from '@langchain/core/documents';
  7. import * as tf from '@tensorflow/tfjs-core';
  8. // import "@tensorflow/tfjs-backend-cpu";
  9. // import '@tensorflow/tfjs-backend-webgpu';
  10. import '@tensorflow/tfjs-backend-webgl';
  11. // import '@tensorflow/tfjs-backend-wasm';
  12. import { TensorFlowEmbeddings } from "@langchain/community/embeddings/tensorflow";
  /**
   * A source document ("story") that can be loaded from a URL, persisted as a
   * `Story` record, split into text chunks and embedded into vectors.
   *
   * Data flows through the instance fields: `loader()` fills `content`,
   * `splitter()` turns `content` into `docList`, and `embedings()` attaches a
   * vector to every entry of `docList` and persists it as a `Document` record.
   */
  export class AgentStory {
    /** The persisted `Story` record; assigned when `save()` completes. */
    story: CloudObject | undefined
    /** Document title. */
    title: string | undefined = ""
    /** Document tags. */
    tags: Array<string> | undefined
    /** Source address of the file. */
    url: string | undefined = ""
    /** Complete plain-text content of the document. */
    content: string | undefined = ""
    /** Hash that uniquely identifies the document (set by `loadDocx()`). */
    hash: string | undefined = ""
    /** Chunks produced by `splitter()`; `embedings()` adds `vector512` to each. */
    docList: Array<Document | any> = []

    /**
     * @param metadata Source URL plus optional title and tags.
     */
    constructor(metadata: {
      url: string,
      title?: string,
      tags?: Array<string>
    }) {
      this.url = metadata.url;
      this.title = metadata.title;
      this.tags = metadata.tags;
      // Backend selection is asynchronous and a constructor cannot await it, so
      // it runs in the background. Log a failure rather than leaving an
      // unhandled promise rejection.
      setBackend().catch((err: unknown) => {
        console.error("AgentStory: failed to initialise the TensorFlow backend", err);
      });
    }

    /**
     * Creates or updates the `Story` record identified by `this.hash`.
     * Does nothing when no hash has been computed yet.
     */
    async save() {
      if (!this.hash) { return; }
      const query = new CloudQuery("Story");
      query.equalTo("hash", this.hash);
      let story = await query.first();
      if (!story?.id) {
        // No record with this hash yet: create a new one.
        story = new CloudObject("Story");
      }
      story.set({
        title: this.title,
        url: this.url,
        content: this.content,
        hash: this.hash,
        tags: this.tags
      });
      this.story = await story.save();
    }

    /**
     * Loads the text content behind `url` and persists the story.
     *
     * `.docx` files are parsed locally via `loadDocx()`; everything else is
     * delegated to the `agent/loader` cloud endpoint.
     *
     * @param url Address of the file to load.
     * @returns The extracted text, or `null` when nothing could be extracted.
     */
    async loader(url: string) {
      const api = new CloudApi();
      let result;
      if (url?.endsWith(".docx")) {
        result = await this.loadDocx(url);
      }
      if (!result) {
        result = await api.fetch("agent/loader", { url: url });
      }
      this.content = result?.data || null;
      if (this.content) {
        this.url = url;
      }
      // Wait for persistence so that `this.story` is available to
      // `embedings()` afterwards. A storage failure is logged but does not
      // prevent the caller from receiving the loaded content.
      try {
        await this.save();
      } catch (err) {
        console.error("AgentStory: failed to save story", err);
      }
      return this.content;
    }

    /**
     * Downloads a `.docx` file and extracts its raw text with mammoth.
     *
     * On success `this.hash` becomes the SHA-256 of the downloaded bytes. When
     * the download or the extraction fails, the hash is cleared so that
     * `save()` does not persist a record keyed by the hash of unparseable data.
     *
     * @param url Address of the `.docx` file.
     * @returns `{ data }` where `data` is the extracted text with runs of
     *          newlines collapsed to one; an empty string on failure.
     */
    async loadDocx(url: string) {
      const response = await fetch(url);
      if (!response.ok) {
        console.error(`AgentStory: failed to fetch ${url}: ${response.status} ${response.statusText}`);
        this.hash = "";
        return { data: "" };
      }
      const arrayBuffer: any = await response.arrayBuffer();
      let text;
      try {
        // In the browser the ArrayBuffer is handed to mammoth directly.
        text = await mammoth.extractRawText({ arrayBuffer: arrayBuffer });
      } catch (err) {
        console.error(err);
      }
      this.hash = text ? await arrayBufferToHASH(arrayBuffer) : "";
      // Collapse consecutive line breaks into a single one.
      const data: string = (text?.value || "").replace(/\n+/g, "\n");
      return { data };
    }

    /**
     * Splits `this.content` into overlapping chunks using the recursive
     * character text splitter and stores them in `this.docList`.
     *
     * @param options `chunkSize` (default 500) and `chunkOverlap` (default 150).
     * @returns The chunk list, or `undefined` when there is no content.
     */
    async splitter(options?: {
      chunkSize: number,
      chunkOverlap: number
    }) {
      if (!this.content) return;
      const splitter = new RecursiveCharacterTextSplitter({
        // `||` on purpose: a chunk size of 0 is treated as "use the default".
        chunkSize: options?.chunkSize || 500,
        // `??` so that an explicit overlap of 0 is honoured.
        chunkOverlap: options?.chunkOverlap ?? 150,
      });
      this.docList = await splitter.splitDocuments([
        new Document({ pageContent: this.content }),
      ]);
      return this.docList;
    }

    /**
     * Extracts an embedding vector for every chunk in `this.docList`, stores it
     * on the chunk as `vector512`, and persists each chunk as a `Document`
     * record.
     *
     * Per the original notes: TensorFlow embeddings have 512 dimensions,
     * OpenAI embeddings have 1536.
     *
     * Persistence is best-effort: a failed write is logged and does not reject,
     * but the method resolves only after every write has settled.
     *
     * @see https://js.langchain.com/docs/integrations/text_embedding/tensorflow/
     * @returns One vector per chunk, or `undefined` when there are no chunks.
     */
    async embedings() {
      if (!this.docList?.length) { return; }
      const embeddings = new TensorFlowEmbeddings();
      const documentRes = await embeddings.embedDocuments(this.docList.map(item => item.pageContent));
      const outcomes = await Promise.allSettled(
        documentRes.map((vector512: any, index) => {
          const document = this.docList[index];
          // Attach the vector in memory before any asynchronous work starts.
          this.docList[index].vector512 = vector512;
          return this.persistChunk(document, vector512);
        })
      );
      outcomes.forEach((outcome, index) => {
        if (outcome.status === "rejected") {
          console.error(`AgentStory: failed to persist document chunk ${index}`, outcome.reason);
        }
      });
      return documentRes;
    }

    /** Creates or updates the `Document` record keyed by the hash of the chunk text. */
    private async persistChunk(document: any, vector512: any) {
      const hash = await arrayBufferToHASH(stringToArrayBuffer(document?.pageContent));
      const query = new CloudQuery("Document");
      query.equalTo("hash", hash);
      let docObj = await query.first();
      if (!docObj?.id) {
        docObj = new CloudObject("Document");
      }
      docObj.set({
        metadata: document?.metadata,
        pageContent: document?.pageContent,
        vector512: vector512,
        hash: hash,
        story: this.story?.toPointer(),
      });
      await docObj.save();
    }

    /**
     * Deletes every `Document` record that belongs to the persisted story.
     * Resolves after all deletions have settled; failures are logged.
     */
    async destoryAllDocument() {
      if (!this.story?.id) { return; }
      const query = new CloudQuery("Document");
      // NOTE(review): documents are written with `story` set to
      // `this.story.toPointer()` but are queried here by the raw id; confirm
      // that CloudQuery matches pointer fields by id.
      query.equalTo("story", this.story?.id);
      const docList = await query.find();
      const outcomes = await Promise.allSettled(docList.map(doc => doc.destroy()));
      outcomes.forEach((outcome) => {
        if (outcome.status === "rejected") {
          console.error("AgentStory: failed to delete document", outcome.reason);
        }
      });
    }
  }
  /**
   * Downloads the resource at `url` and returns its body as a `Buffer`.
   *
   * @param url Address of the file to download.
   * @returns The response body bytes.
   * @throws Error when the response status is not OK.
   */
  export async function fetchFileBuffer(url: string): Promise<Buffer> {
    const res = await fetch(url);
    if (res.ok) {
      const bytes = await res.arrayBuffer();
      return Buffer.from(bytes);
    }
    throw new Error(`Failed to fetch file: ${res.status} ${res.statusText}`);
  }
  /**
   * Selects a TensorFlow.js backend and waits until TensorFlow.js is ready.
   *
   * Candidates are collected in order of preference (WebGPU, then WebGL, then
   * WebAssembly) according to what the current environment supports. They are
   * tried in that order until `tf.setBackend` succeeds: a backend the
   * environment supports can still fail to activate (for example when its
   * backend package has not been imported), in which case the next candidate
   * is used instead of aborting.
   *
   * `navigator` and `document` are only touched when they exist, so calling
   * this outside a browser page does not throw.
   */
  async function setBackend() {
    const candidates: string[] = [];

    // WebGPU support is signalled by `navigator.gpu`.
    if (typeof navigator !== "undefined" && (navigator as any).gpu) {
      candidates.push("webgpu");
    }

    // WebGL support is probed by requesting a context from a scratch canvas.
    if (typeof document !== "undefined") {
      const glcanvas = document.createElement('canvas');
      if (glcanvas.getContext('webgl') || glcanvas.getContext('experimental-webgl')) {
        candidates.push("webgl");
      }
    }

    if (typeof WebAssembly === 'object' && typeof WebAssembly.instantiate === 'function') {
      candidates.push("wasm");
    }

    for (const backend of candidates) {
      try {
        if (await tf.setBackend(backend)) {
          break;
        }
      } catch (err) {
        console.warn(`TensorFlow backend "${backend}" is unavailable, trying the next one`, err);
      }
    }

    // With no usable candidate, tf.ready() leaves backend choice to TensorFlow.js.
    await tf.ready();
    return;
  }
  /**
   * Computes the SHA-256 digest of a buffer with the SubtleCrypto API.
   *
   * @param arrayBuffer The bytes to hash.
   * @returns The digest as a lowercase hexadecimal string (two digits per byte).
   */
  export async function arrayBufferToHASH(arrayBuffer:any) {
    const digest = await crypto.subtle.digest('SHA-256', arrayBuffer);
    let hex = '';
    for (const byte of new Uint8Array(digest)) {
      // Left-pad so that every byte contributes exactly two hex digits.
      hex += byte.toString(16).padStart(2, '0');
    }
    return hex;
  }
  /**
   * Encodes a string with `TextEncoder` and returns the underlying buffer of
   * the resulting byte array.
   *
   * @param str The text to encode.
   * @returns The buffer holding the encoded bytes.
   */
  export function stringToArrayBuffer(str:string) {
    const encodedBytes = new TextEncoder().encode(str);
    return encodedBytes.buffer;
  }
  /**
   * Embeds a single query with a fresh `TensorFlowEmbeddings` instance.
   *
   * @param str The query to embed.
   * @returns The embedding vector produced by `embedQuery`.
   */
  export async function EmbedQuery(str:any):Promise<Array<number>>{
    const model = new TensorFlowEmbeddings();
    const queryVector = await model.embedQuery(str);
    return queryVector;
  }
  /**
   * Ranks documents by cosine similarity to a query vector.
   *
   * Each document's `vector512` is compared with `vector1` and the score is
   * written to `doc.similarity`. The array is then sorted in place, most
   * similar first.
   *
   * @param vector1 The query vector.
   * @param docList Documents carrying a `vector512` field; mutated and reordered.
   * @returns The same `docList` array, sorted by descending similarity.
   */
  export function RetriveAllDocument(vector1: Array<number>, docList: Array<any>): Array<any> {
    for (const doc of docList) {
      doc.similarity = cosineSimilarity(vector1, doc.vector512);
    }
    const bySimilarityDescending = (left: any, right: any) => right.similarity - left.similarity;
    return docList.sort(bySimilarityDescending);
  }
  /**
   * Dot product of two vectors: the sum of the element-wise products, taken
   * over the indices of `vectorA`.
   */
  function dotProduct(vectorA: number[], vectorB: number[]): number {
    let total = 0;
    for (let i = 0; i < vectorA.length; i++) {
      total = total + vectorA[i] * vectorB[i];
    }
    return total;
  }
  /** Euclidean length of a vector: the square root of the sum of its squared components. */
  function magnitude(vector: number[]): number {
    let sumOfSquares = 0;
    for (let i = 0; i < vector.length; i++) {
      sumOfSquares = sumOfSquares + vector[i] * vector[i];
    }
    return Math.sqrt(sumOfSquares);
  }
  /**
   * Cosine similarity of two vectors: their dot product divided by the product
   * of their magnitudes.
   *
   * @throws Error when either vector has zero magnitude.
   */
  function cosineSimilarity(vectorA: number[], vectorB: number[]): number {
    const numerator = dotProduct(vectorA, vectorB);
    const lengthA = magnitude(vectorA);
    const lengthB = magnitude(vectorB);
    const bothNonZero = lengthA !== 0 && lengthB !== 0;
    if (!bothNonZero) {
      throw new Error("One or both vectors are zero vectors, cannot compute cosine similarity.");
    }
    return numerator / (lengthA * lengthB);
  }