|  | @@ -0,0 +1,85 @@
 | 
	
		
			
				/**
				 * Install the dependencies required below:
				 *   npm i gpt-tokenizer langchain mammoth pdf-parse -S
				 */
 | 
	
		
			
				|  |  | +const {
 | 
	
		
			
				|  |  | +    encode,
 | 
	
		
			
				|  |  | +    encodeChat,
 | 
	
		
			
				|  |  | +    decode,
 | 
	
		
			
				|  |  | +    isWithinTokenLimit,
 | 
	
		
			
				|  |  | +    encodeGenerator,
 | 
	
		
			
				|  |  | +    decodeGenerator,
 | 
	
		
			
				|  |  | +    decodeAsyncGenerator,
 | 
	
		
			
				|  |  | +  } = require('gpt-tokenizer')
 | 
	
		
			
				|  |  | +const {
 | 
	
		
			
				|  |  | +    SupportedTextSplitterLanguages,
 | 
	
		
			
				|  |  | +    RecursiveCharacterTextSplitter,
 | 
	
		
			
				|  |  | +    TokenTextSplitter
 | 
	
		
			
				|  |  | +  } = require("langchain/text_splitter");
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +var mammoth = require("mammoth");
 | 
	
		
			
				|  |  | +const fs = require('fs');
 | 
	
		
			
				|  |  | +const pdf = require('pdf-parse');
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Demo pipeline: converts a .docx file to HTML, splits the HTML into chunks,
				 * and tokenizes the first chunk.
				 *
				 * The first chunk and its token ids are written to stdout.
				 *
				 * @returns {Promise<void>} Resolves after the first chunk has been tokenized,
				 *   or right after a warning when the splitter produced no chunks.
				 */
				async function main() {
				    // Extract text chunks from the document.
				    const html = await docsLoader("../data/pgvector.docx");
				    const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
				        chunkSize: 4096,
				        chunkOverlap: 20,
				    });
				    const output = await splitter.createDocuments([html]);

				    // An empty chunk list would make `output[0].pageContent` throw a
				    // TypeError, so report it and stop instead.
				    if (!Array.isArray(output) || output.length === 0) {
				        console.warn("No text chunks were produced from the document.");
				        return;
				    }

				    const content = output[0].pageContent;
				    console.log(content);

				    // Encode text into tokens
				    const tokens = encode(content);
				    console.log(tokens);
				    // const embeddings = new OpenAIEmbeddings({});
				    // let vector = embeddings.embed_query(content)
				}
 | 
	
		
			
				// Run the demo. A failure is logged and turned into a non-zero exit code
				// rather than being left as an unhandled promise rejection.
				main().catch((error) => {
				    console.error(error);
				    process.exitCode = 1;
				});
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Converts a .docx file to HTML with mammoth.
				 *
				 * The generated HTML and any conversion messages are logged to stdout.
				 *
				 * @param {string} path - Location of the .docx file, passed to
				 *   `mammoth.convertToHtml` as its `path` option.
				 * @returns {Promise<string>} The generated HTML (`result.value`).
				 * @throws Rejects with the error raised by `mammoth.convertToHtml`, so a
				 *   failed conversion reaches the caller instead of leaving the promise
				 *   pending forever.
				 */
				async function docsLoader(path) {
				    const result = await mammoth.convertToHtml({ path });
				    const html = result.value; // The generated HTML
				    const messages = result.messages; // Any messages, such as warnings during conversion
				    console.log(html);
				    console.log(messages);
				    return html;
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				/**
				 * Reads a PDF file and parses it with pdf-parse.
				 *
				 * The page counts, info, metadata, PDF.js version and extracted text of the
				 * parsed document are logged to stdout.
				 *
				 * @param {string|URL} path - Location of the PDF file to read.
				 * @returns {Promise<object>} The object resolved by `pdf(dataBuffer)`.
				 * @throws Rejects when the file cannot be read or when parsing fails, so
				 *   the caller is never left awaiting a promise that cannot settle.
				 */
				async function pdfLoader(path) {
				    // Non-blocking read; failures reject the returned promise.
				    const dataBuffer = await fs.promises.readFile(path);
				    const data = await pdf(dataBuffer);

				    // number of pages
				    console.log(data.numpages);
				    // number of rendered pages
				    console.log(data.numrender);
				    // PDF info
				    console.log(data.info);
				    // PDF metadata
				    console.log(data.metadata);
				    // PDF.js version
				    // check https://mozilla.github.io/pdf.js/getting_started/
				    console.log(data.version);
				    // PDF text
				    console.log(data.text);

				    return data;
				}
 |