const {
  SupportedTextSplitterLanguages,
  RecursiveCharacterTextSplitter,
  TokenTextSplitter,
} = require("langchain/text_splitter");
const mammoth = require("mammoth");
const fs = require('fs');
const pdf = require('pdf-parse');

/**
 * Demo entry point: loads a PDF and a DOCX file, splits their contents into
 * chunks with two different LangChain text splitters, and logs the results.
 *
 * @returns {Promise<void>} Settles once both documents have been processed.
 * @throws Rejects with whatever error the loaders or splitters raise.
 */
async function main() {
  // Loader step: extract plain text from the PDF, then split it by token count.
  const data = await pdfLoader("../data/pgvector.pdf");
  const text = data.text;
  console.log(text);

  const tokenSplitter = new TokenTextSplitter({
    encodingName: "gpt2",
    chunkSize: 500,
    chunkOverlap: 0,
  });
  const pdfChunks = await tokenSplitter.createDocuments([text]);
  console.log(pdfChunks);

  // Chunking step: convert the DOCX to HTML, then split it with the
  // HTML-aware recursive splitter. Distinct variable names are used here
  // because redeclaring `const splitter` / `const output` in the same scope
  // is a SyntaxError.
  const html = await docsLoader("../data/pgvector.docx");
  const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
    chunkSize: 4096,
    chunkOverlap: 20,
  });
  const htmlChunks = await htmlSplitter.createDocuments([html]);
  console.log(htmlChunks);
  console.log(JSON.stringify(htmlChunks[0]));
}

/**
 * Converts the document at `path` to HTML using mammoth and logs the HTML
 * together with any conversion messages.
 *
 * @param {string} path - Path of the document passed to mammoth.convertToHtml.
 * @returns {Promise<string>} The generated HTML (`result.value`).
 * @throws Rejects if mammoth fails, so callers see the failure instead of
 *   waiting on a promise that never settles.
 */
async function docsLoader(path) {
  const result = await mammoth.convertToHtml({ path });
  const html = result.value; // The generated HTML
  const messages = result.messages; // Any messages, such as warnings during conversion
  console.log(html);
  console.log(messages);
  return html;
}

/**
 * Reads the file at `path`, parses it with pdf-parse, and logs the fields of
 * the parse result.
 *
 * @param {string} path - Path of the PDF file to read.
 * @returns {Promise<object>} The pdf-parse result object; `main` reads its
 *   `text` property.
 * @throws Rejects if the file cannot be read or pdf-parse fails.
 */
async function pdfLoader(path) {
  // Non-blocking read; a missing or unreadable file surfaces as a rejection.
  const dataBuffer = await fs.promises.readFile(path);
  const data = await pdf(dataBuffer);

  // number of pages
  console.log(data.numpages);
  // number of rendered pages
  console.log(data.numrender);
  // PDF info
  console.log(data.info);
  // PDF metadata
  console.log(data.metadata);
  // PDF.js version
  // check https://mozilla.github.io/pdf.js/getting_started/
  console.log(data.version);
  // PDF text
  console.log(data.text);

  return data;
}

// Run the demo; report failures and signal them through the exit code rather
// than leaving a floating promise.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});