Browse Source

feat: text loader splitter embed

未来全栈 3 months ago
parent
commit
2ec9ba25cc

+ 3 - 0
rag-server/face-feat68/README.md

@@ -1,5 +1,8 @@
 
 # RAG示例:人脸向量数据库
+
+- pgvector 介绍文档 https://cheatsheet.md/zh/vector-database/pgvector.zh
+
 # NODE环境
 ``` bash
 npm install canvas --canvas_binary_host_mirror=https://registry.npmmirror.com/-/binary/canvas

File diff suppressed because it is too large
+ 1093 - 0
rag-server/package-lock.json


+ 5 - 0
rag-server/package.json

@@ -9,6 +9,11 @@
   "license": "ISC",
   "description": "",
   "dependencies": {
+    "compressing": "^1.10.1",
+    "gpt-tokenizer": "^2.8.1",
+    "langchain": "^0.3.7",
+    "mammoth": "^1.8.0",
+    "pdf-parse": "^1.1.1",
     "pg-promise": "^11.10.2"
   }
 }

BIN
rag-server/rag-loaders/data/pgvector.docx


BIN
rag-server/rag-loaders/data/pgvector.pdf


BIN
rag-server/rag-loaders/data/~$vector.docx


+ 85 - 0
rag-server/rag-loaders/test/test.embed.js

@@ -0,0 +1,85 @@
+/**
+ * npm i gpt-tokenizer -S
+ */
+const {
+    encode,
+    encodeChat,
+    decode,
+    isWithinTokenLimit,
+    encodeGenerator,
+    decodeGenerator,
+    decodeAsyncGenerator,
+  } = require('gpt-tokenizer')
+const {
+    SupportedTextSplitterLanguages,
+    RecursiveCharacterTextSplitter,
+    TokenTextSplitter
+  } = require("langchain/text_splitter");
+
+// const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
+
+var mammoth = require("mammoth");
+const fs = require('fs');
+const pdf = require('pdf-parse');
+
+async function main(){
+  
+    // 文本提取文本块
+    let html = await docsLoader("../data/pgvector.docx")
+    const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
+        chunkSize: 4096,
+        chunkOverlap: 20,
+      });
+    const output = await splitter.createDocuments([html]);
+    // console.log(output);
+    let content = output[0].pageContent;
+    console.log(content);
+    // Encode text into tokens
+    const tokens = encode(content)
+    console.log(tokens)
+    // const embeddings = new OpenAIEmbeddings({});
+    // let vector = embeddings.embed_query(content)
+}
+main()
+
+
+async function docsLoader(path){
+    return new Promise(resolve=>{
+        mammoth.convertToHtml({path:  path})
+        .then(function(result){
+            var html = result.value; // The generated HTML
+            var messages = result.messages; // Any messages, such as warnings during conversion
+            console.log(html)
+            console.log(messages)
+            resolve(html)
+        })
+        .catch(function(error) {
+            console.error(error);
+        });
+    })
+}
+
+async function pdfLoader(path){
+    
+    let dataBuffer = fs.readFileSync(path);
+    return new Promise(resolve=>{
+        pdf(dataBuffer).then(function(data) {
+        
+            // number of pages
+            console.log(data.numpages);
+            // number of rendered pages
+            console.log(data.numrender);
+            // PDF info
+            console.log(data.info);
+            // PDF metadata
+            console.log(data.metadata); 
+            // PDF.js version
+            // check https://mozilla.github.io/pdf.js/getting_started/
+            console.log(data.version);
+            // PDF text
+            console.log(data.text); 
+            resolve(data)
+        });
+    })
+
+}

+ 57 - 0
rag-server/rag-loaders/test/test.loader.js

@@ -0,0 +1,57 @@
+/**
+ npm install -S langchain
+ npm install mammoth -S
+ npm i compressing -S
+ npm install pdf-parse -S
+
+ */
+var mammoth = require("mammoth");
+
+const fs = require('fs');
+const pdf = require('pdf-parse');
+
+async function main(){
+ 
+    pdfLoader("../data/pgvector.pdf")
+
+
+    docsLoader("../data/pgvector.docx")
+
+        
+}
+main()
+
+async function docsLoader(path){
+    mammoth.convertToHtml({path:  path})
+    .then(function(result){
+        var html = result.value; // The generated HTML
+        var messages = result.messages; // Any messages, such as warnings during conversion
+        console.log(html)
+        console.log(messages)
+    })
+    .catch(function(error) {
+        console.error(error);
+    });
+}
+async function pdfLoader(path){
+    
+    let dataBuffer = fs.readFileSync(path);
+    
+    pdf(dataBuffer).then(function(data) {
+    
+        // number of pages
+        console.log(data.numpages);
+        // number of rendered pages
+        console.log(data.numrender);
+        // PDF info
+        console.log(data.info);
+        // PDF metadata
+        console.log(data.metadata); 
+        // PDF.js version
+        // check https://mozilla.github.io/pdf.js/getting_started/
+        console.log(data.version);
+        // PDF text
+        console.log(data.text); 
+            
+    });
+}

+ 80 - 0
rag-server/rag-loaders/test/test.splitter.js

@@ -0,0 +1,80 @@
+
+const {
+    SupportedTextSplitterLanguages,
+    RecursiveCharacterTextSplitter,
+    TokenTextSplitter
+  } = require("langchain/text_splitter");
+
+var mammoth = require("mammoth");
+const fs = require('fs');
+const pdf = require('pdf-parse');
+
+async function main(){
+ 
+    // 加载器提取纯文本
+    let data = await pdfLoader("../data/pgvector.pdf")
+    let text = data.text;
+    console.log(text)
+    const splitter = new TokenTextSplitter({
+        encodingName: "gpt2",
+        chunkSize: 500,
+        chunkOverlap: 0,
+    });
+    const output = await splitter.createDocuments([text]);
+    console.log(output)
+  
+    // 文本提取文本块
+    let html = await docsLoader("../data/pgvector.docx")
+    const splitter = RecursiveCharacterTextSplitter.fromLanguage("html", {
+        chunkSize: 4096,
+        chunkOverlap: 20,
+      });
+    const output = await splitter.createDocuments([html]);
+    
+    console.log(output);
+    console.log(JSON.stringify(output[0]));
+        
+}
+main()
+
+
+async function docsLoader(path){
+    return new Promise(resolve=>{
+        mammoth.convertToHtml({path:  path})
+        .then(function(result){
+            var html = result.value; // The generated HTML
+            var messages = result.messages; // Any messages, such as warnings during conversion
+            console.log(html)
+            console.log(messages)
+            resolve(html)
+        })
+        .catch(function(error) {
+            console.error(error);
+        });
+    })
+}
+
+async function pdfLoader(path){
+    
+    let dataBuffer = fs.readFileSync(path);
+    return new Promise(resolve=>{
+        pdf(dataBuffer).then(function(data) {
+        
+            // number of pages
+            console.log(data.numpages);
+            // number of rendered pages
+            console.log(data.numrender);
+            // PDF info
+            console.log(data.info);
+            // PDF metadata
+            console.log(data.metadata); 
+            // PDF.js version
+            // check https://mozilla.github.io/pdf.js/getting_started/
+            console.log(data.version);
+            // PDF text
+            console.log(data.text); 
+            resolve(data)
+        });
+    })
+
+}

+ 10 - 4
src/app/app.routes.ts

@@ -4,13 +4,19 @@ export const routes: Routes = [
   {
     path: '',
     loadChildren: () => import('./tabs/tabs.routes').then((m) => m.routes),
+  },
+   // 向量:文本特征向量
+   {
+      path: "text/embed",
+      loadComponent: () => import('../modules/text/page-text-embed/page-text-embed.component').then(m => m.PageTextEmbedComponent),
+      runGuardsAndResolvers: "always",
   },
   // 向量:面部特征向量
   {
-    path: "face/feat68",
-    loadComponent: () => import('../modules/face/page-feat68/page-feat68.component').then(m => m.PageFeat68Component),
-    runGuardsAndResolvers: "always",
-},
+      path: "face/feat68",
+      loadComponent: () => import('../modules/face/page-feat68/page-feat68.component').then(m => m.PageFeat68Component),
+      runGuardsAndResolvers: "always",
+  },
    // 聊天模块
    {
       path: "chat/session/role/:roleId",

+ 39 - 0
src/modules/text/page-text-embed/page-text-embed.component.html

@@ -0,0 +1,39 @@
+<ion-content>
+  <ion-segment [value]="tab"  (ionChange)="tabChange($event)"> <!-- Tab selector: reflects `tab` and forwards ionChange events to tabChange() -->
+    <ion-segment-button value="models">
+      <ion-label>模型加载</ion-label>
+    </ion-segment-button>
+    <ion-segment-button value="feat68">
+      <ion-label>特征提取</ion-label>
+    </ion-segment-button>
+    <ion-segment-button value="match">
+      <ion-label>相似对比</ion-label>
+    </ion-segment-button>
+  </ion-segment>
+  @if(tab=="match"){ <!-- "match" tab: renders nothing yet -->
+   
+  }
+
+  @if(tab=="feat68"){ <!-- "feat68" tab: renders nothing yet -->
+   
+  }
+
+  @if(tab=="models"){ <!-- "models" tab: one row per entry of textList -->
+    @for(text of textList;track text.title){
+      <div style="display: flex;flex-direction: column;">
+        <div style="display: flex;">
+          <div style="width: 70%;">{{text?.title}}.{{text?.content}} </div>
+          <div style="flex:1;display: flex; flex-direction: column;">
+            <ion-button (click)="getTextVector(text)">特征向量</ion-button> <!-- calls getTextVector() with this row's entry -->
+            @if(text?.vector){ <!-- shown only when the entry has a truthy `vector` -->
+              <div>向量 {{text?.vector}}</div>
+              <!-- <ion-button (click)="selectText(text)">选中</ion-button> -->
+            }
+          </div>
+        </div>
+
+      </div>
+    }
+  }
+
+</ion-content>

+ 0 - 0
src/modules/text/page-text-embed/page-text-embed.component.scss


+ 22 - 0
src/modules/text/page-text-embed/page-text-embed.component.spec.ts

@@ -0,0 +1,22 @@
+import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing';
+
+import { PageTextEmbedComponent } from './page-text-embed.component';
+
+describe('PageTextEmbedComponent', () => {
+  // Rebuilt before every spec by `setUp`
+  let harness: ComponentFixture<PageTextEmbedComponent>;
+  let instance: PageTextEmbedComponent;
+
+  /** Configure the testing module with the standalone component and render it once. */
+  const setUp = (): void => {
+    const testBed = TestBed.configureTestingModule({
+      imports: [PageTextEmbedComponent],
+    });
+    testBed.compileComponents();
+
+    harness = TestBed.createComponent(PageTextEmbedComponent);
+    instance = harness.componentInstance;
+    harness.detectChanges();
+  };
+
+  beforeEach(waitForAsync(setUp));
+
+  it('should create', () => {
+    expect(instance).toBeTruthy();
+  });
+});

+ 55 - 0
src/modules/text/page-text-embed/page-text-embed.component.ts

@@ -0,0 +1,55 @@
+import { Component, OnInit } from '@angular/core';
+import { DomSanitizer, SafeResourceUrl } from '@angular/platform-browser';
+import { Router } from '@angular/router';
+import { CommonModule } from '@angular/common';
+    
+import { IonContent,IonButton,IonSegment,IonSegmentButton,IonLabel } from "@ionic/angular/standalone";
+import { CloudObject, CloudQuery } from 'src/lib/ncloud';
+
+/**
+ * One sample text listed on the page.
+ * `vector` is rendered by the template when it is set.
+ * NOTE(review): presumably an embedding as a number array — confirm once
+ * getTextVector is implemented.
+ */
+interface TextSample {
+  title: string;
+  content: string;
+  vector?: number[];
+}
+
+/**
+ * Page with a three-tab segment; the "models" tab lists the sample texts and
+ * offers a button per text that calls `getTextVector`.
+ */
+@Component({
+  selector: 'app-page-text-embed',
+  templateUrl: './page-text-embed.component.html',
+  styleUrls: ['./page-text-embed.component.scss'],
+  standalone: true,
+  imports:[
+    CommonModule,
+    IonContent,IonButton,
+    IonSegment,IonSegmentButton,IonLabel
+  ]
+})
+export class PageTextEmbedComponent  implements OnInit {
+
+  /** Value of the currently selected segment button. */
+  tab:string = "models"
+
+  /**
+   * Feature extraction: sample texts shown on the "models" tab.
+   */
+  textList: TextSample[] = [
+    {title:"lyf1",content:`content is lyf1`},
+    {title:"lyf2",content:`content is lyf2`},
+    {title:"ym11",content:`content is ym11`},
+    {title:"ym2",content:`content is ym2`},
+    {title:"gtl1",content:`content is gtl1`},
+    {title:"gtl2",content:`content is gtl2`},
+    {title:"gtl3",content:`content is gtl3`},
+  ]
+
+  constructor(
+    private router: Router,
+    ) { }
+
+  ngOnInit() {
+  }
+
+  /**
+   * Segment change handler: stores the selected segment value in `tab`.
+   * Non-string values (e.g. an event without `detail.value`) are ignored so
+   * that `tab` always stays a string.
+   *
+   * @param ev change event emitted by the segment; only `ev.detail.value` is read
+   */
+  tabChange(ev:any){
+    const value: unknown = ev?.detail?.value
+    if (typeof value === "string") {
+      this.tab = value
+    }
+  }
+
+  /** Navigate to the "/mirror/login" route. */
+  back(){
+    this.router.navigate(["/mirror/login"])
+  }
+
+  /**
+   * Feature extraction: text feature vector.
+   * Currently a stub: it reads the text's content and does nothing with it.
+   *
+   * @param text the entry whose button was clicked
+   */
+  getTextVector(text: TextSample){
+    const content = text.content;
+    // TODO: compute the vector for `content`; the template displays `text.vector` once it is set
+  }
+}

Some files were not shown because too many files changed in this diff