story.loader.ts 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. // import pdf from 'pdf-parse';
  2. // import fs from 'fs';
  3. import { CloudApi } from 'src/lib/ncloud';
  4. import mammoth from "mammoth";
  5. import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
  6. import { Document } from '@langchain/core/documents';
  7. import * as tf from '@tensorflow/tfjs-core';
  8. // import "@tensorflow/tfjs-backend-cpu";
  9. // import '@tensorflow/tfjs-backend-webgpu';
  10. import '@tensorflow/tfjs-backend-webgl';
  11. // import '@tensorflow/tfjs-backend-wasm';
  12. import { TensorFlowEmbeddings } from "@langchain/community/embeddings/tensorflow";
  /**
   * Loads a story document, splits its text into chunks and computes
   * embedding vectors for those chunks.
   *
   * Typical flow: `loader(url)` -> `splitter()` -> `embedings()`.
   */
  export class AgentStory{
    /** URL of the most recently loaded document; updated only when loading produced content. */
    url:string = ""
    /** Plain text of the loaded document; empty string when nothing has been loaded. */
    content:string = ""
    /** Chunks produced by the most recent `splitter()` call. */
    docList:Array<Document> = []
    /**
     * Settles once TensorFlow backend selection has finished. Failures are
     * logged instead of rejecting, so awaiting this never throws.
     */
    private backendReady:Promise<void>

    constructor(){
      // Keep the promise: leaving it floating would turn a backend failure into
      // an unhandled rejection and would let embedings() race the initialisation.
      this.backendReady = setBackend().catch((err:unknown)=>{
        console.error(err)
      })
    }

    /**
     * Loads the text content of the document at `url`.
     *
     * `.docx` files are parsed locally first; anything else, or a `.docx` that
     * yielded no text, is sent to the `agent/loader` cloud endpoint.
     *
     * @param url location of the document to load
     * @returns the extracted text, or an empty string when nothing could be loaded
     */
    async loader(url:string){
      let api = new CloudApi();
      let result;
      if(url?.endsWith(".docx")){
        result = await this.loadDocx(url)
      }
      // loadDocx always returns an object, so test the extracted text itself;
      // checking the object alone would never trigger the fallback.
      if(!result?.data){
        result = await api.fetch("agent/loader",{url:url})
      }
      // `content` is declared as string, so store "" rather than null.
      this.content = result?.data || ""
      if(this.content){
        this.url = url
      }
      return this.content
    }

    /**
     * Downloads a `.docx` file and extracts its raw text with mammoth.
     *
     * Best effort: download and parse failures are logged and reported as an
     * empty `data` string so the caller can fall back to another loader.
     *
     * @param url location of the `.docx` file
     * @returns `{data}` where `data` is the extracted text with runs of newlines collapsed
     */
    async loadDocx(url:string){
      let data = ""
      try {
        const response = await fetch(url);
        if(!response.ok){
          throw new Error(`Failed to fetch file: ${response.status} ${response.statusText}`)
        }
        // Typed as any, as in the original code, so the arrayBuffer input is
        // accepted regardless of the installed mammoth typings.
        const arrayBuffer:any = await response.arrayBuffer();
        // In the browser the arrayBuffer is passed to mammoth directly.
        const text = await mammoth.extractRawText({arrayBuffer:arrayBuffer});
        data = text?.value || ""
      } catch (err) {
        console.error(err);
      }
      // Collapse consecutive line breaks into a single one.
      data = data.replace(/\n+/g,"\n")
      return {data}
    }

    /**
     * Splits `content` into overlapping chunks and stores them in `docList`.
     *
     * Uses the recursive character text splitter by default.
     *
     * @param options chunk size (default 500) and chunk overlap (default 100)
     * @returns the resulting chunks, or `undefined` when no content is loaded
     */
    async splitter(options?:{
      chunkSize:number,
      chunkOverlap:number
    }){
      if(!this.content) return
      let splitter = new RecursiveCharacterTextSplitter({
        // A chunk size of 0 is meaningless, so any falsy value falls back to the default.
        chunkSize: options?.chunkSize || 500,
        // An overlap of 0 is a legitimate choice; only a missing value gets the default.
        chunkOverlap: options?.chunkOverlap ?? 100,
      });
      this.docList = await splitter.splitDocuments([
        new Document({ pageContent: this.content }),
      ]);
      return this.docList
    }

    /**
     * Computes an embedding vector for every chunk in `docList`.
     *
     * Storage note kept from the original code:
     * `embedding vector(1536) NOT NULL` -- 1536 is the ChatGPT embedding size.
     * NOTE(review): the vector length produced here depends on the TensorFlow
     * model and may differ from 1536 -- confirm before persisting.
     *
     * @see https://js.langchain.com/docs/integrations/text_embedding/tensorflow/
     * @returns one vector per chunk, or `undefined` when there are no chunks
     */
    async embedings(){
      if(!this.docList?.length){return}
      // Wait for the backend selection started in the constructor to finish.
      await this.backendReady
      const embeddings = new TensorFlowEmbeddings();
      return embeddings.embedDocuments(this.docList.map(item=>item.pageContent));
    }
  }
  /**
   * Downloads the resource at `url` and returns its body as a Buffer.
   *
   * @param url location of the file to download
   * @returns the response body wrapped in a Buffer
   * @throws Error when the response status is not OK
   */
  export async function fetchFileBuffer(url: string): Promise<Buffer> {
    const res = await fetch(url);
    if (res.ok) {
      const body = await res.arrayBuffer();
      return Buffer.from(body);
    }
    throw new Error(`Failed to fetch file: ${res.status} ${res.statusText}`);
  }
  /**
   * Selects and initialises a TensorFlow.js backend for the current environment.
   *
   * Candidates are tried in the order webgpu, webgl, wasm. A candidate is used
   * only when the environment supports it AND a backend of that name has been
   * registered with TensorFlow.js; requesting an unregistered backend would
   * fail. If a candidate fails to initialise, the next one is tried. When no
   * candidate succeeds, TensorFlow.js is left to pick its own backend.
   *
   * Safe to call outside a browser: the `navigator` and `document` probes are guarded.
   */
  async function setBackend(){
    const candidates:string[] = []

    // WebGPU support is exposed as navigator.gpu.
    if (typeof navigator !== "undefined" && (navigator as any).gpu) {
      candidates.push("webgpu")
    }

    // WebGL support is probed by asking a scratch canvas for a context.
    if (typeof document !== "undefined") {
      const glcanvas = document.createElement('canvas');
      const WebGL = glcanvas.getContext('webgl') || glcanvas.getContext('experimental-webgl');
      if (WebGL) {
        candidates.push("webgl")
      }
    }

    if (typeof WebAssembly === 'object' && typeof WebAssembly.instantiate === 'function') {
      candidates.push("wasm")
    }

    for (const name of candidates) {
      // Skip backends whose package was never imported/registered.
      if (!tf.findBackendFactory(name)) continue
      try {
        // setBackend resolves to false when the backend could not be initialised.
        if (await tf.setBackend(name)) break
      } catch (err) {
        console.warn(`TensorFlow backend "${name}" failed to initialise`, err)
      }
    }

    await tf.ready();
    return
  }