import.js 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. const cheerio = require('cheerio');
  2. const { GushiHrefList } = require("./href")
  3. // 引用Parse JS SDK
  4. const Parse = require("parse/node");
  // Configure the Parse JS SDK before any query or save is issued.
  Parse.initialize("dev"); // applicationId
  Parse.serverURL = "http://web2023.fmode.cn:9999/parse"; // Parse server endpoint

  // Scrape the listed pages and save the results. `main` is a hoisted function
  // declaration, so calling it ahead of its definition is fine. The returned
  // promise is observed so that a failure is reported and reflected in the
  // process exit code instead of surfacing as an unhandled rejection.
  main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
  /**
   * Scrape every URL in `GushiHrefList` and store each poem that is not yet
   * present as a "Shige" object on the Parse server.
   *
   * URLs are processed one at a time so that:
   *  - the returned promise settles only after every save has finished
   *    (an async callback passed to `forEach` is never awaited);
   *  - the existence check for one poem sees the saves made for earlier ones,
   *    so a URL listed twice is not inserted twice;
   *  - a failure on one URL is logged and does not abort the rest.
   *
   * @returns {Promise<void>} Resolves once all URLs have been handled.
   */
  async function main() {
    // The subclass is the same for every poem; create it once.
    const Shige = Parse.Object.extend("Shige");

    // For a quick trial run, iterate over [GushiHrefList[0]] instead.
    for (const href of GushiHrefList) {
      try {
        const gushi = await getGushiDataFromHref(href);
        // Pages without a title or author could not be parsed; skip them.
        if (!gushi?.title || !gushi?.author) continue;

        // Skip poems that are already stored.
        const exists = await checkExists(gushi);
        if (exists?.id) continue;

        // Insert the new poem and wait for the server to confirm it.
        const sg = new Shige();
        sg.set(gushi);
        await sg.save();
      } catch (err) {
        console.error("失败:", href, err);
      }
    }
  }
  /**
   * Look up a stored "Shige" object with the same title, author and dynasty
   * as the given poem.
   *
   * The query is sent to the server exactly once.
   *
   * @param {{title?: string, author?: string, dynasty?: string}} gushi
   *   Scraped poem data to match against.
   * @returns {Promise<*>} Whatever `Parse.Query#first` resolves to for the
   *   match; callers treat a result with an `id` as "already exists".
   */
  async function checkExists(gushi) {
    const query = new Parse.Query("Shige");
    query.equalTo("title", gushi?.title);
    query.equalTo("author", gushi?.author);
    query.equalTo("dynasty", gushi?.dynasty);
    return query.first();
  }
  /**
   * Download one poem page and extract its fields with cheerio.
   *
   * Any download problem (network error, non-2xx HTTP status, or a failure
   * while reading the response body) is logged and reported as an empty
   * object, so callers can skip the page by checking for a missing title.
   *
   * @param {string} href URL of the poem page.
   * @returns {Promise<object>} `{}` on download failure; otherwise an object
   *   with `title`, `author`, `dynasty` (element text) and `content`,
   *   `intro`, `yiwen`, `note`, `review` (element inner HTML, as returned by
   *   cheerio for the matching selector).
   */
  async function getGushiDataFromHref(href) {
    let html;
    try {
      const response = await fetch(href);
      // An error page is not a poem page; do not try to parse it.
      if (!response.ok) throw new Error(`HTTP ${response.status}`);
      // Reading the body can fail too, so it belongs inside the try.
      html = await response.text();
    } catch (err) {
      console.log("失败:", href, err?.message ?? err);
      return {};
    }

    const $ = cheerio.load(html);

    // Every field lives under the same content column of the page.
    const column = "body > div.container.basic-page > div > div.more-container.col-md-8";
    const header = `${column} > div:nth-child(1)`;
    const authorInfo = `${header} > div.author-simple-info`;
    const details = `${column} > div:nth-child(5)`;

    const gushi = {
      title: $(`${header} > h1`).text(),
      author: $(`${authorInfo} > span:nth-child(3) > a`).text(),
      dynasty: $(`${authorInfo} > span:nth-child(1) > a`).text(),
      content: $(`${header} > div.shici-content.check-more`).html(),
      intro: $(`${details} > div:nth-child(2)`).html(),
      yiwen: $(`${details} > div:nth-child(4)`).html(),
      note: $(`${details} > div:nth-child(6)`).html(),
      review: $(`${details} > div:nth-child(8)`).html(),
    };

    console.log(href, gushi);
    return gushi;
  }