/**
 * Demo: load a .docx document, split it into HTML chunks with LangChain,
 * and count GPT tokens for the first chunk.
 *
 * Dependencies: npm i gpt-tokenizer langchain mammoth pdf-parse -S
 */
const {
  encode,
  encodeChat,
  decode,
  isWithinTokenLimit,
  encodeGenerator,
  decodeGenerator,
  decodeAsyncGenerator,
} = require('gpt-tokenizer');
const {
  SupportedTextSplitterLanguages,
  RecursiveCharacterTextSplitter,
  TokenTextSplitter,
} = require("langchain/text_splitter");
const mammoth = require("mammoth");
const fs = require('fs');
const pdf = require('pdf-parse');

/**
 * Extract text chunks from a document, then tokenize the first chunk.
 *
 * @returns {Promise<void>}
 * @throws propagates any conversion/splitting failure to the caller
 */
async function main() {
  // Extract the document body as HTML, then split on HTML structure.
  const html = await docsLoader("../data/pgvector.docx");
  const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
    chunkSize: 4096,
    chunkOverlap: 20,
  });
  const output = await splitter.createDocuments([html]);
  const content = output[0].pageContent;
  console.log(content);

  // Encode text into tokens
  const tokens = encode(content);
  console.log(tokens);
}

// Fix: the original floating `main()` call had no rejection handler, and a
// failure inside docsLoader previously left its promise pending forever.
// Surface any error and signal failure via the exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

/**
 * Convert a .docx file to HTML using mammoth.
 *
 * @param {string} path - filesystem path to the .docx file
 * @returns {Promise<string>} the generated HTML
 * @throws rejects if conversion fails (previously the error was swallowed
 *         and the returned promise never settled)
 */
async function docsLoader(path) {
  // mammoth already returns a promise; no need to wrap it in `new Promise`.
  const result = await mammoth.convertToHtml({ path: path });
  const html = result.value; // The generated HTML
  const messages = result.messages; // Any messages, such as warnings during conversion
  console.log(html);
  console.log(messages);
  return html;
}

/**
 * Parse a PDF file and log its metadata and extracted text.
 *
 * @param {string} path - filesystem path to the PDF file
 * @returns {Promise<object>} the pdf-parse result object
 * @throws rejects on read or parse failure (previously an unhandled rejection)
 */
async function pdfLoader(path) {
  const dataBuffer = fs.readFileSync(path);
  const data = await pdf(dataBuffer);
  // number of pages
  console.log(data.numpages);
  // number of rendered pages
  console.log(data.numrender);
  // PDF info
  console.log(data.info);
  // PDF metadata
  console.log(data.metadata);
  // PDF.js version
  // check https://mozilla.github.io/pdf.js/getting_started/
  console.log(data.version);
  // PDF text
  console.log(data.text);
  return data;
}