12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
- const {
- encode,
- encodeChat,
- decode,
- isWithinTokenLimit,
- encodeGenerator,
- decodeGenerator,
- decodeAsyncGenerator,
- } = require('gpt-tokenizer')
- const {
- SupportedTextSplitterLanguages,
- RecursiveCharacterTextSplitter,
- TokenTextSplitter
- } = require("langchain/text_splitter");
- var mammoth = require("mammoth");
- const fs = require('fs');
- const pdf = require('pdf-parse');
/**
 * Demo entry point: converts a .docx file to HTML, splits the HTML with a
 * RecursiveCharacterTextSplitter configured for the "html" language, and
 * prints the first chunk together with its gpt-tokenizer encoding.
 *
 * @returns {Promise<void>} Settles once the first chunk and its tokens have
 *   been logged.
 */
async function main() {
  const sourceHtml = await docsLoader("../data/pgvector.docx");

  const splitterOptions = {
    chunkSize: 4096,
    chunkOverlap: 20,
  };
  const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage("html", splitterOptions);
  const documents = await htmlSplitter.createDocuments([sourceHtml]);

  // Only the first chunk is inspected here.
  const { pageContent } = documents[0];
  console.log(pageContent);

  const tokenIds = encode(pageContent);
  console.log(tokenIds);
}
// Kick off the demo. `main` returns a promise, so attach a rejection handler
// instead of leaving it floating: log the failure and mark the process as
// failed rather than relying on the unhandled-rejection default.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
/**
 * Converts the .docx file at `path` to HTML using mammoth.
 *
 * The generated HTML and the messages reported by mammoth are logged to the
 * console before the HTML is returned.
 *
 * @param {string} path - Location of the .docx file to convert.
 * @returns {Promise<string>} The HTML produced by `mammoth.convertToHtml`.
 * @throws Rejects with whatever error `mammoth.convertToHtml` rejects with.
 *   Previously such errors were only logged and the returned promise never
 *   settled, leaving callers waiting forever.
 */
async function docsLoader(path) {
  // Await the library promise directly; wrapping it in `new Promise` dropped
  // the rejection path.
  const result = await mammoth.convertToHtml({ path });
  const { value: html, messages } = result;
  console.log(html);
  console.log(messages);
  return html;
}
/**
 * Reads the file at `path` and parses it with pdf-parse.
 *
 * The `numpages`, `numrender`, `info`, `metadata`, `version` and `text`
 * fields of the parse result are logged to the console before the result is
 * returned.
 *
 * @param {string|URL} path - Location of the PDF file to read.
 * @returns {Promise<object>} The object that `pdf(dataBuffer)` resolves with.
 * @throws Rejects if the file cannot be read or if `pdf` rejects. Previously a
 *   parse failure left the returned promise pending forever.
 */
async function pdfLoader(path) {
  // Non-blocking read; a read failure still surfaces as a rejection of the
  // returned promise, as it did with the synchronous read.
  const dataBuffer = await fs.promises.readFile(path);

  // Await the parser directly so its rejections propagate to the caller.
  const data = await pdf(dataBuffer);

  console.log(data.numpages);
  console.log(data.numrender);
  console.log(data.info);
  console.log(data.metadata);
  console.log(data.version);
  console.log(data.text);

  return data;
}
|