web.loader.js 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. const cheerio = require('cheerio');
  2. const {Document} = require('langchain/document');
  /**
   * Loads a single web page and turns its text content into a LangChain
   * `Document`.
   *
   * Relies on the module-level `cheerio` and `Document` bindings and on a
   * global `fetch` implementation.
   */
  class WebpageLoader {
    /** Address of the page this loader fetches. */
    url;

    /**
     * @param {string} url - Address of the page to load.
     */
    constructor(url) {
      this.url = url;
    }

    /**
     * Fetches the page, drops non-content tags, and gathers the text of every
     * element that is a direct child of `<body>`, one element per line.
     *
     * Text nodes placed directly under `<body>` (outside any element) are not
     * matched by the `body > *` selector and are therefore not included.
     *
     * @returns {Promise<Document[]>} A one-element array whose document holds
     *   the extracted text in `pageContent` and the page address in
     *   `metadata.source`.
     * @throws {Error} When the server answers with a non-2xx status. Rejections
     *   from `fetch` itself (e.g. network failures) propagate unchanged.
     */
    async load() {
      // Request options are kept exactly as before; they mirror a browser-style
      // request (header set, CORS mode, credentials).
      const response = await fetch(this.url, {
        headers: {
          accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
          'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
          'cache-control': 'max-age=0',
        },
        referrerPolicy: 'strict-origin-when-cross-origin',
        body: null,
        method: 'GET',
        mode: 'cors',
        credentials: 'include',
      });

      // Fail loudly instead of returning an error page's text as if it were
      // the requested content.
      if (!response.ok) {
        throw new Error(
          `Failed to load ${this.url}: HTTP ${response.status} ${response.statusText}`,
        );
      }

      const pageText = await response.text();

      // cheerio.load is synchronous, so no await is needed here.
      const $ = cheerio.load(pageText);

      // Remove tags that never carry readable content.
      $('script,style,meta,link,noscript').remove();

      // Collect the text of each top-level body element, newline-terminated.
      // Attributes are irrelevant to .text(), so they are left untouched.
      const textAll = $('body > *')
        .toArray()
        .map((element) => `${$(element).text()}\n`)
        .join('');

      return [
        new Document({
          pageContent: textAll,
          metadata: {
            source: this.url,
          },
        }),
      ];
    }
  }
  47. module.exports.WebpageLoader = WebpageLoader