test.embed.js 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. /**
  2. * npm i gpt-tokenizer -S
  3. */
  4. const {
  5. encode,
  6. encodeChat,
  7. decode,
  8. isWithinTokenLimit,
  9. encodeGenerator,
  10. decodeGenerator,
  11. decodeAsyncGenerator,
  12. } = require('gpt-tokenizer')
  13. const {
  14. SupportedTextSplitterLanguages,
  15. RecursiveCharacterTextSplitter,
  16. TokenTextSplitter
  17. } = require("langchain/text_splitter");
  18. // const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
  19. var mammoth = require("mammoth");
  20. const fs = require('fs');
  21. const pdf = require('pdf-parse');
  /**
   * Script entry point: converts the sample .docx to HTML, splits the HTML into
   * chunks with the HTML-aware recursive splitter, then logs the first chunk and
   * its token encoding.
   *
   * @returns {Promise<void>} Settles once the first chunk has been logged, or
   *   immediately after a warning when the splitter produced no chunks.
   */
  async function main() {
    // Extract the document as HTML, then split it into text chunks.
    const html = await docsLoader("../data/pgvector.docx");
    const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
      chunkSize: 4096,
      chunkOverlap: 20,
    });
    const output = await splitter.createDocuments([html]);

    // Guard: indexing output[0] on an empty result would throw a TypeError.
    if (output.length === 0) {
      console.warn("No text chunks were produced from the document.");
      return;
    }

    // Only the first chunk is inspected here.
    const content = output[0].pageContent;
    console.log(content);

    // Encode text into tokens
    const tokens = encode(content);
    console.log(tokens);

    // TODO: compute an embedding for `content` (the OpenAIEmbeddings import is
    // currently commented out at the top of the file).
  }

  // Never leave the entry-point promise floating: report the failure and mark
  // the process as failed instead of relying on the unhandled-rejection default.
  main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
  /**
   * Convert a .docx file to HTML with mammoth.
   *
   * Logs the generated HTML and any conversion messages (e.g. warnings) before
   * returning.
   *
   * @param {string} path - Path of the document, forwarded to
   *   `mammoth.convertToHtml({ path })`.
   * @returns {Promise<string>} The generated HTML (`result.value`).
   * @throws Rejects with the original conversion error. The error is logged and
   *   then rethrown so that callers awaiting this function do not hang forever.
   */
  async function docsLoader(path) {
    try {
      const { value: html, messages } = await mammoth.convertToHtml({ path });
      console.log(html); // The generated HTML
      console.log(messages); // Any messages, such as warnings during conversion
      return html;
    } catch (error) {
      console.error(error);
      // Propagate the failure: swallowing it here would leave the caller's
      // `await` pending indefinitely.
      throw error;
    }
  }
  /**
   * Read a PDF file and parse it with pdf-parse, logging the parsed fields.
   *
   * @param {string} path - Path of the PDF file to read.
   * @returns {Promise<object>} The object produced by `pdf(buffer)`; this
   *   function logs its `numpages`, `numrender`, `info`, `metadata`, `version`
   *   and `text` properties.
   * @throws Rejects when the file cannot be read or when parsing fails, instead
   *   of leaving the returned promise pending.
   */
  async function pdfLoader(path) {
    // Non-blocking read; a missing or unreadable file rejects the returned promise.
    const dataBuffer = await fs.promises.readFile(path);
    const data = await pdf(dataBuffer);

    // number of pages
    console.log(data.numpages);
    // number of rendered pages
    console.log(data.numrender);
    // PDF info
    console.log(data.info);
    // PDF metadata
    console.log(data.metadata);
    // PDF.js version
    // check https://mozilla.github.io/pdf.js/getting_started/
    console.log(data.version);
    // PDF text
    console.log(data.text);

    return data;
  }