1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253 |
- const cheerio = require('cheerio');
- const {Document} = require('langchain/document');
/**
 * Loads a single web page and exposes its text content as one document.
 *
 * The page is downloaded with the global `fetch`, parsed with cheerio, and the
 * text of every direct child of `<body>` is concatenated (one line per child).
 */
class WebpageLoader {
  /** Address of the page to download; also recorded as the document source. */
  url;

  /**
   * @param {string} url - Address of the page to download.
   */
  constructor(url) {
    this.url = url;
  }

  /**
   * Downloads the page and extracts its text.
   *
   * @returns {Promise<Document[]>} A one-element array whose document holds the
   *   extracted text in `pageContent` and the page address in `metadata.source`.
   * @throws {Error} When the server answers with a non-successful HTTP status,
   *   so that error pages are never returned as if they were real content.
   */
  async load() {
    const response = await fetch(this.url, {
      headers: {
        accept:
          'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'cache-control': 'max-age=0',
      },
      referrerPolicy: 'strict-origin-when-cross-origin',
      body: null,
      method: 'GET',
      mode: 'cors',
      credentials: 'include',
    });

    // A 4xx/5xx body is an error page, not the content the caller asked for.
    if (!response.ok) {
      throw new Error(
        `Failed to load ${this.url}: HTTP ${response.status} ${response.statusText}`,
      );
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Drop tags that never carry readable content.
    $('script,style,meta,link,noscript').remove();

    // Collect the text of each top-level body element, one per line.
    // Attributes are irrelevant here because `.text()` only reads text nodes.
    const blocks = [];
    $('body > *').each((_index, element) => {
      blocks.push(`${$(element).text()}\n`);
    });

    return [
      new Document({
        pageContent: blocks.join(''),
        metadata: {
          source: this.url,
        },
      }),
    ];
  }
}
// Expose the loader class to CommonJS consumers as the named export `WebpageLoader`.
- module.exports.WebpageLoader = WebpageLoader
|