123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
- const cheerio = require('cheerio');
- const { GushiHrefList } = require("./href")
- // Load the Parse JS SDK (Node build)
- const Parse = require("parse/node");
- Parse.initialize("dev"); // set the applicationId
- Parse.serverURL = "http://web2023.fmode.cn:9999/parse"; // set the serverURL
// Crawl every poem page listed in GushiHrefList and save the ones not stored yet.
main().catch((err) => {
  // Report a fatal failure explicitly instead of leaving a floating promise.
  console.error(err);
  process.exitCode = 1;
});

/**
 * Scrape each href in GushiHrefList and store new poems as "Shige" objects.
 *
 * Pages are processed one at a time, so:
 *  - the returned promise settles only after every page has been handled;
 *  - the duplicate check for one page sees the records saved for earlier
 *    pages (a concurrent check-then-insert could store duplicates);
 *  - the target site and the Parse server are not hit with every request
 *    at once.
 *
 * A failure on one page is logged and does not stop the remaining pages.
 *
 * @returns {Promise<void>} Resolves once every href has been processed.
 */
async function main() {
  const Shige = Parse.Object.extend("Shige");

  // For a quick single-page trial, iterate over [GushiHrefList[0]] instead.
  for (const href of GushiHrefList) {
    try {
      const gushi = await getGushiDataFromHref(href);
      // Skip pages that could not be fetched or parsed into a usable record.
      if (!gushi?.title || !gushi?.author) continue;

      // Duplicate check: skip poems that already have a stored record.
      const exists = await checkExists(gushi);
      if (exists?.id) continue;

      // Create the new record and wait for the save to finish.
      const sg = new Shige();
      sg.set(gushi);
      await sg.save();
    } catch (err) {
      console.error("失败:", href, err);
    }
  }
}
/**
 * Look up a stored "Shige" record with the same title, author and dynasty
 * as the given poem.
 *
 * @param {{title?: string, author?: string, dynasty?: string}} gushi
 *   Scraped poem data; only these three fields are used for matching.
 * @returns {Promise<*>} Whatever `query.first()` resolves with. Callers treat
 *   a result that has an `id` as "already stored".
 */
async function checkExists(gushi) {
  const query = new Parse.Query("Shige");
  query.equalTo("title", gushi?.title);
  query.equalTo("author", gushi?.author);
  query.equalTo("dynasty", gushi?.dynasty);
  // Execute the lookup exactly once; a second first() call would only repeat
  // the same request and throw the first answer away.
  return query.first();
}
/**
 * Download one poem page and extract its fields with cheerio.
 *
 * Any failure to obtain the page — a rejected fetch, a non-OK HTTP status,
 * or an error while reading the body — is logged and reported as an empty
 * object, so callers can skip the page by checking for a missing title.
 *
 * @param {string} href URL of the poem page.
 * @returns {Promise<object>} `{}` on failure; otherwise an object with
 *   `title`, `author`, `dynasty` (element text) and `content`, `intro`,
 *   `yiwen`, `note`, `review` (element inner HTML, `null` when the element
 *   is absent — per cheerio's `.html()`).
 */
async function getGushiDataFromHref(href) {
  let html;
  try {
    const response = await fetch(href);
    if (!response.ok) {
      // An error page would not contain the poem markup; treat it as a failure.
      console.log("失败:", href, response.status);
      return {};
    }
    html = await response.text();
  } catch (err) {
    console.log("失败:", href);
    return {};
  }

  const $ = cheerio.load(html);

  // All fields live under the same main column; the first child block holds
  // the poem header/body, the fifth holds the additional sections.
  const column = 'body > div.container.basic-page > div > div.more-container.col-md-8';
  const head = `${column} > div:nth-child(1)`;
  const extra = `${column} > div:nth-child(5)`;

  const gushi = {
    title: $(`${head} > h1`).text(),
    author: $(`${head} > div.author-simple-info > span:nth-child(3) > a`).text(),
    dynasty: $(`${head} > div.author-simple-info > span:nth-child(1) > a`).text(),
    content: $(`${head} > div.shici-content.check-more`).html(),
    // NOTE(review): the meaning of each section below is inferred from the
    // field names only (presumably intro / translation / notes / commentary)
    // — confirm against the page layout.
    intro: $(`${extra} > div:nth-child(2)`).html(),
    yiwen: $(`${extra} > div:nth-child(4)`).html(),
    note: $(`${extra} > div:nth-child(6)`).html(),
    review: $(`${extra} > div:nth-child(8)`).html(),
  };

  console.log(href, gushi);
  return gushi;
}
|