const cheerio = require('cheerio');
const { Document } = require('langchain/document');

// Options passed to fetch() for every page request.
const REQUEST_OPTIONS = Object.freeze({
  headers: {
    accept:
      'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'max-age=0',
  },
  referrerPolicy: 'strict-origin-when-cross-origin',
  body: null,
  method: 'GET',
  mode: 'cors',
  credentials: 'include',
});

// Tags whose contents are not readable page text.
const NON_CONTENT_SELECTOR = 'script,style,meta,link,noscript';

/**
 * Loads a single web page and exposes its visible text as a langchain Document.
 */
class WebpageLoader {
  /** URL of the page to load. */
  url;

  /**
   * @param {string} url - Address of the page to fetch.
   */
  constructor(url) {
    this.url = url;
  }

  /**
   * Fetches the page, strips non-content tags, and collects the text of every
   * direct child of <body>, one child per line.
   *
   * @returns {Promise<Document[]>} A one-element array holding a Document
   *   whose `pageContent` is the extracted text and whose `metadata.source`
   *   is the loader's URL.
   * @throws {Error} If the server answers with a non-2xx status. The promise
   *   also rejects if fetch() itself fails.
   */
  async load() {
    const response = await fetch(this.url, REQUEST_OPTIONS);

    // Without this check an HTTP error page would be returned as if it were
    // the real page content.
    if (!response.ok) {
      throw new Error(
        `Failed to load ${this.url}: HTTP ${response.status} ${response.statusText}`,
      );
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // Remove non-content tags so their bodies do not leak into the text.
    $(NON_CONTENT_SELECTOR).remove();

    // Each top-level element of <body> contributes its text followed by a
    // newline, in document order.
    const pageContent = $('body > *')
      .toArray()
      .map((element) => `${$(element).text()}\n`)
      .join('');

    return [
      new Document({
        pageContent,
        metadata: { source: this.url },
      }),
    ];
  }
}

module.exports.WebpageLoader = WebpageLoader;