1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- const {
- SupportedTextSplitterLanguages,
- RecursiveCharacterTextSplitter,
- TokenTextSplitter
- } = require("langchain/text_splitter");
- var mammoth = require("mammoth");
- const fs = require('fs');
- const pdf = require('pdf-parse');
/**
 * Demo entry point: loads a PDF and a DOCX file and splits each into chunks.
 *
 * 1. Reads "../data/pgvector.pdf" via `pdfLoader`, takes the `text` field of
 *    the result and splits it with a `TokenTextSplitter`.
 * 2. Converts "../data/pgvector.docx" to HTML via `docsLoader` and splits it
 *    with a `RecursiveCharacterTextSplitter` created for the "html" language.
 *
 * The extracted text and the resulting documents are written to the console.
 *
 * @returns {Promise<void>} Settles once both files have been processed;
 *   rejects if any awaited loader or splitter call rejects.
 */
async function main() {
  // Loader step: extract the plain text from the PDF.
  const pdfData = await pdfLoader("../data/pgvector.pdf");
  const text = pdfData.text;
  console.log(text);

  const tokenSplitter = new TokenTextSplitter({
    encodingName: "gpt2",
    chunkSize: 500,
    chunkOverlap: 0,
  });
  const pdfChunks = await tokenSplitter.createDocuments([text]);
  console.log(pdfChunks);

  // Splitter step: turn the DOCX-derived HTML into text chunks.
  // The two pipelines use distinct variable names; redeclaring the same
  // `const` twice in one scope is a SyntaxError.
  const html = await docsLoader("../data/pgvector.docx");
  const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
    chunkSize: 4096,
    chunkOverlap: 20,
  });
  const htmlChunks = await htmlSplitter.createDocuments([html]);

  console.log(htmlChunks);
  console.log(JSON.stringify(htmlChunks[0]));
}
// Run the demo. Attach a rejection handler so a failure is reported and the
// process exits with a non-zero status instead of leaving a floating promise.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
/**
 * Converts the document at `path` to HTML using mammoth.
 *
 * The generated HTML and any conversion messages (such as warnings) are
 * logged to the console before the HTML is returned.
 *
 * @param {string} path - Path of the file handed to `mammoth.convertToHtml`.
 * @returns {Promise<string>} The generated HTML (`result.value`).
 * @throws Rejects with the error raised by `mammoth.convertToHtml`, so a
 *   failed conversion is visible to the caller instead of leaving the
 *   returned promise pending forever.
 */
async function docsLoader(path) {
  const result = await mammoth.convertToHtml({ path });
  const html = result.value; // The generated HTML
  const messages = result.messages; // Any messages, such as warnings during conversion
  console.log(html);
  console.log(messages);
  return html;
}
/**
 * Reads the file at `path` and parses it with pdf-parse.
 *
 * Several fields of the parse result are logged to the console before the
 * result object is returned unchanged.
 *
 * @param {string|Buffer|URL} path - Location of the PDF file, passed to
 *   `fs.promises.readFile`.
 * @returns {Promise<object>} The object produced by `pdf(buffer)`; callers in
 *   this file read its `text` property.
 * @throws Rejects if the file cannot be read or if `pdf` rejects, rather than
 *   leaving the returned promise pending.
 */
async function pdfLoader(path) {
  // Non-blocking read: avoids stalling the event loop inside an async function.
  const dataBuffer = await fs.promises.readFile(path);
  const data = await pdf(dataBuffer);

  // number of pages
  console.log(data.numpages);
  // number of rendered pages
  console.log(data.numrender);
  // PDF info
  console.log(data.info);
  // PDF metadata
  console.log(data.metadata);
  // PDF.js version
  // check https://mozilla.github.io/pdf.js/getting_started/
  console.log(data.version);
  // PDF text
  console.log(data.text);

  return data;
}
|