test.splitter.js 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. const {
  2. SupportedTextSplitterLanguages,
  3. RecursiveCharacterTextSplitter,
  4. TokenTextSplitter
  5. } = require("langchain/text_splitter");
  6. var mammoth = require("mammoth");
  7. const fs = require('fs');
  8. const pdf = require('pdf-parse');
  /**
   * Demo entry point: loads a PDF and a DOCX file and prints the chunks produced
   * by two different LangChain text splitters.
   *
   * Stage 1 reads "../data/pgvector.pdf" through `pdfLoader`, then splits its
   * plain text with a `TokenTextSplitter`.
   * Stage 2 reads "../data/pgvector.docx" through `docsLoader` (which yields
   * HTML), then splits it with an HTML-aware `RecursiveCharacterTextSplitter`.
   *
   * All results are written to the console; nothing is returned.
   *
   * @returns {Promise<void>} Settles once both stages have been logged; rejects
   *   if either loader or splitter fails.
   */
  async function main() {
    // Loader step: extract the plain text of the PDF.
    const data = await pdfLoader("../data/pgvector.pdf");
    const text = data.text;
    console.log(text);

    // Each stage uses its own variable names: redeclaring the same `const`
    // twice inside one function scope is a SyntaxError.
    const tokenSplitter = new TokenTextSplitter({
      encodingName: "gpt2",
      chunkSize: 500, // presumably measured in tokens - confirm against the LangChain docs
      chunkOverlap: 0,
    });
    const tokenChunks = await tokenSplitter.createDocuments([text]);
    console.log(tokenChunks);

    // Splitter step: turn the DOCX-derived HTML into text chunks.
    const html = await docsLoader("../data/pgvector.docx");
    const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
      chunkSize: 4096,
      chunkOverlap: 20,
    });
    const htmlChunks = await htmlSplitter.createDocuments([html]);
    console.log(htmlChunks);
    console.log(JSON.stringify(htmlChunks[0]));
  }
  // Start the demo. The promise must not be left floating: report any failure
  // and mark the process as failed instead of relying on the runtime's
  // unhandled-rejection behaviour.
  main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
  /**
   * Converts a .docx file to HTML with mammoth and returns the HTML string.
   *
   * The generated HTML and any conversion messages (such as warnings) are
   * logged to the console.
   *
   * @param {string} path - Path of the .docx file to convert.
   * @returns {Promise<string>} The generated HTML.
   * @throws Rethrows whatever mammoth rejects with, after logging it, so the
   *   caller's `await` fails instead of waiting forever on a promise that
   *   never settles.
   */
  async function docsLoader(path) {
    try {
      // `value` is the generated HTML; `messages` holds any conversion warnings.
      const { value: html, messages } = await mammoth.convertToHtml({ path });
      console.log(html);
      console.log(messages);
      return html;
    } catch (error) {
      console.error(error);
      throw error;
    }
  }
  /**
   * Reads a PDF file from disk, parses it with pdf-parse and returns the parse
   * result.
   *
   * The page count, rendered page count, info, metadata, PDF.js version and
   * extracted text of the document are logged to the console.
   *
   * @param {string} path - Path of the PDF file to read.
   * @returns {Promise<object>} The object produced by pdf-parse; this function
   *   reads its `numpages`, `numrender`, `info`, `metadata`, `version` and
   *   `text` properties.
   * @throws Rejects if the file cannot be read or if parsing fails, so callers
   *   can handle the error with try/catch.
   */
  async function pdfLoader(path) {
    const dataBuffer = await fs.promises.readFile(path);
    // Await the parser directly: wrapping it in `new Promise` without a reject
    // path would hide parse failures from the caller.
    const data = await pdf(dataBuffer);

    // number of pages
    console.log(data.numpages);
    // number of rendered pages
    console.log(data.numrender);
    // PDF info
    console.log(data.info);
    // PDF metadata
    console.log(data.metadata);
    // PDF.js version
    // check https://mozilla.github.io/pdf.js/getting_started/
    console.log(data.version);
    // PDF text
    console.log(data.text);

    return data;
  }