12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
- /**
- * npm i gpt-tokenizer -S
- */
- const {
- encode,
- encodeChat,
- decode,
- isWithinTokenLimit,
- encodeGenerator,
- decodeGenerator,
- decodeAsyncGenerator,
- } = require('gpt-tokenizer')
- const {
- SupportedTextSplitterLanguages,
- RecursiveCharacterTextSplitter,
- TokenTextSplitter
- } = require("langchain/text_splitter");
- // const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
- var mammoth = require("mammoth");
- const fs = require('fs');
- const pdf = require('pdf-parse');
/**
 * Demo pipeline: convert a DOCX file to HTML, split the HTML into chunks,
 * then print the first chunk and the token ids produced by `encode` for it.
 *
 * Only the first chunk is tokenized; any further chunks are ignored.
 * When the splitter returns no chunks, a warning is logged and nothing is
 * tokenized.
 *
 * @returns {Promise<void>} Settles after the first chunk and its tokens have
 *   been logged (or after the "no chunks" warning).
 */
async function main() {
  // Extract the document content (as HTML) that will be split into text blocks.
  const html = await docsLoader("../data/pgvector.docx");

  const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
    chunkSize: 4096,
    chunkOverlap: 20,
  });
  const output = await splitter.createDocuments([html]);

  // Without this guard, an empty result makes output[0] undefined and the
  // pageContent access below would throw a TypeError.
  if (output.length === 0) {
    console.warn("No text chunks were produced from the document.");
    return;
  }

  const content = output[0].pageContent;
  console.log(content);

  // Encode text into tokens
  const tokens = encode(content);
  console.log(tokens);

  // TODO: embed `content`; the OpenAIEmbeddings import at the top of the file
  // is currently commented out, so no embedding is computed here.
}
// Run the demo. Any failure is reported and the process is marked as failed,
// instead of leaving the promise returned by main() unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
/**
 * Convert a document to HTML with mammoth and return the generated markup.
 *
 * The generated HTML and mammoth's conversion messages are logged to the
 * console before returning.
 *
 * @param {string} path - File path handed to `mammoth.convertToHtml`.
 * @returns {Promise<string>} The generated HTML (`result.value`).
 * @throws Rejects with the error raised by `mammoth.convertToHtml`, so callers
 *   see the failure instead of waiting on a promise that never settles.
 */
async function docsLoader(path) {
  // Awaiting directly (rather than wrapping in `new Promise`) lets a failed
  // conversion reject the returned promise.
  const { value: html, messages } = await mammoth.convertToHtml({ path });

  console.log(html); // The generated HTML
  console.log(messages); // Any messages, such as warnings during conversion
  return html;
}
/**
 * Read a PDF file from disk, parse it with `pdf`, log the parsed fields and
 * return the parse result.
 *
 * @param {string} path - Path of the PDF file to read.
 * @returns {Promise<object>} The object produced by `pdf(dataBuffer)`; the
 *   fields logged below are read from it.
 * @throws Rejects when the file cannot be read or when `pdf` rejects, so the
 *   caller is never left waiting on a promise that cannot settle.
 */
async function pdfLoader(path) {
  // Non-blocking read; a read failure rejects the returned promise.
  const dataBuffer = await fs.promises.readFile(path);

  // Awaiting directly propagates parser errors to the caller.
  const data = await pdf(dataBuffer);

  // number of pages
  console.log(data.numpages);
  // number of rendered pages
  console.log(data.numrender);
  // PDF info
  console.log(data.info);
  // PDF metadata
  console.log(data.metadata);
  // PDF.js version
  // check https://mozilla.github.io/pdf.js/getting_started/
  console.log(data.version);
  // PDF text
  console.log(data.text);

  return data;
}
|