|
@@ -0,0 +1,85 @@
|
|
|
/**
 * Install the packages this script requires:
 *   npm i gpt-tokenizer langchain mammoth pdf-parse -S
 */
|
|
|
+const {
|
|
|
+ encode,
|
|
|
+ encodeChat,
|
|
|
+ decode,
|
|
|
+ isWithinTokenLimit,
|
|
|
+ encodeGenerator,
|
|
|
+ decodeGenerator,
|
|
|
+ decodeAsyncGenerator,
|
|
|
+ } = require('gpt-tokenizer')
|
|
|
+const {
|
|
|
+ SupportedTextSplitterLanguages,
|
|
|
+ RecursiveCharacterTextSplitter,
|
|
|
+ TokenTextSplitter
|
|
|
+ } = require("langchain/text_splitter");
|
|
|
+
|
|
|
+// const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
|
|
|
+
|
|
|
+var mammoth = require("mammoth");
|
|
|
+const fs = require('fs');
|
|
|
+const pdf = require('pdf-parse');
|
|
|
+
|
|
|
/**
 * Demo pipeline: convert a .docx file to HTML, split the HTML into chunks,
 * and tokenize the first chunk.
 *
 * Only the first chunk produced by the splitter is logged and encoded.
 *
 * @param {string} [docPath="../data/pgvector.docx"] - Path of the .docx file
 *   to load; passed unchanged to `docsLoader`.
 * @returns {Promise<Array>} The value returned by `encode` for the first
 *   chunk, or an empty array when the splitter produced no chunks.
 */
async function main(docPath = "../data/pgvector.docx") {
  // Extract the document as HTML and split it into text chunks.
  const html = await docsLoader(docPath);
  const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
    chunkSize: 4096,
    chunkOverlap: 20,
  });
  const output = await splitter.createDocuments([html]);

  // An empty document yields no chunks; bail out instead of dereferencing
  // `undefined.pageContent`.
  const firstChunk = output[0];
  if (firstChunk === undefined) {
    console.warn(`No text chunks were produced from ${docPath}`);
    return [];
  }

  const content = firstChunk.pageContent;
  console.log(content);

  // Encode text into tokens
  const tokens = encode(content);
  console.log(tokens);

  // TODO: compute an embedding vector for `content` once an embeddings
  // client (e.g. OpenAIEmbeddings) is wired in.
  return tokens;
}
|
|
|
// Run the demo. Failures are logged and reported through a non-zero exit
// code instead of surfacing as an unhandled promise rejection.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
|
|
+
|
|
|
+
|
|
|
/**
 * Convert a .docx file to HTML with mammoth.
 *
 * The generated HTML and any conversion messages are logged to the console.
 *
 * @param {string} path - Path of the .docx file to convert.
 * @returns {Promise<string>} The generated HTML (`result.value`).
 * @throws Rejects with the original error when the conversion fails, so the
 *   caller is never left awaiting a promise that cannot settle.
 */
async function docsLoader(path) {
  try {
    const result = await mammoth.convertToHtml({ path });
    const html = result.value; // The generated HTML
    const messages = result.messages; // Any messages, such as warnings during conversion
    console.log(html);
    console.log(messages);
    return html;
  } catch (error) {
    // Keep the log line, but propagate the failure to the caller.
    console.error(error);
    throw error;
  }
}
|
|
|
+
|
|
|
/**
 * Read a PDF file and parse it with pdf-parse.
 *
 * The page counts, info, metadata, PDF.js version and extracted text of the
 * parsed document are logged to the console.
 *
 * @param {string} path - Path of the PDF file to read.
 * @returns {Promise<object>} The object resolved by `pdf(...)`.
 * @throws Rejects when the file cannot be read or the parser fails.
 */
async function pdfLoader(path) {
  // Non-blocking read; the original synchronous read stalled the event loop.
  const dataBuffer = await fs.promises.readFile(path);
  const data = await pdf(dataBuffer);

  // number of pages
  console.log(data.numpages);
  // number of rendered pages
  console.log(data.numrender);
  // PDF info
  console.log(data.info);
  // PDF metadata
  console.log(data.metadata);
  // PDF.js version
  // check https://mozilla.github.io/pdf.js/getting_started/
  console.log(data.version);
  // PDF text
  console.log(data.text);

  return data;
}
|