# test.py (4.2 KB)
import math
import os
import random
import time

import pandas as pd
import requests
  6. user_agent_list = [
  7. 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
  8. 'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
  9. 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
  10. 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
  11. 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
  12. 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  13. 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
  14. 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
  15. "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Mobile Safari/537.36",
  16. ]
  17. # 目标url
  18. url = "https://mp.weixin.qq.com/cgi-bin/appmsg"
  19. cookie = "RK=TUH9LG9y6o; ptcz=041daae73db8505dfd0ff96e29a4581909cebbf15066b9bd1558d118bebaa1f5; _uetvid=8fefc26029f111efb982b3356b222011; eas_sid=Y1Y7S1I8z3m3b0t2x9s5u0L8K6; pgv_pvid=4064759573; ua_id=K2pUMGwWe84czx2QAAAAAKSEdKbqPddzcdKmlAE9G5o=; _clck=r2a5fe|1|frz|0; uuid=591f115be24ec02fb5df0c3c3ae4d861; wxuin=35057412413009; cert=KKKZAG5uCQahXHPwzUWIPULSwFInsjS7; _qpsvr_localtk=0.013361729807209288; data_bizuin=3957811830; bizuin=3957811830; master_user=gh_f39bb2700e4a; master_sid=V19WQ1d0OUNtbjRYYURMT2Z2Y1RHRVhRbnZiRHJKeVBudEJfTGkwbVFfT3FsUXRNZ3h1REswanh3Skg1eW5xOUJ0Z0ZvMkUxbUxqR3lpUlNLdU5NMkhkVDF6alNIaFJhb3RKVUFRN0k2bkdienBKQ29objBObnQyYTltMVZxdFNzUWppOVpCSlIxQXlGWHB4; master_ticket=e2eef0a5d458a291a73e7094fb4a1903; data_ticket=Pzv5vtcB+Ss8M4pLuz1aU1XDeC3loe1HlFt9rG+L1nQ1/fTl5pbw8CxB0EQSwkR8; rand_info=CAESIPuH85LjttrzOe5zQa4a1hq/m6bXwcrQqykLgxm0DZ5+; slave_bizuin=3957811830; slave_user=gh_f39bb2700e4a; slave_sid=dkpnOENBV1VGc2FOSVlPeklMU2F2T1c0dkMxQTE5QzdCampCOHVETDBGUmdaSzFPekIzQzJFdTJGbl8xVlRfeU9JaGZaQWpnNk5sV01ySzhDeTlKb3o0UllqOTZnMmpTN3d5VDlfRVdabDlLVEVjenBCaTZpdEc1SnBUVWlrU05qcmtubXZYbXI3U1BxS0xR; _clsk=1eshhim|1735057844548|3|1|mp.weixin.qq.com/weheat-agent/payload/record; rewardsn=; wxtokenkey=777"
  20. # 使用Cookie,跳过登陆操作
  21. data = {
  22. "token": "1299150377",
  23. "lang": "zh_CN",
  24. "f": "json",
  25. "ajax": "1",
  26. "action": "list_ex",
  27. "begin": "0",
  28. "count": "5",
  29. "query": "",
  30. "fakeid": "MzAxODQ1OTM3Mg==",
  31. "type": "9",
  32. }
  33. headers = {
  34. "Cookie": cookie,
  35. "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Mobile Safari/537.36",
  36. }
  37. content_json = requests.get(url, headers=headers, params=data).json()
  38. count = int(content_json["app_msg_cnt"])
  39. print(count)
  40. page = int(math.ceil(count / 5))
  41. print(page)
  42. content_list = []
  43. # 功能:爬取IP存入ip_list列表
  44. for i in range(page):
  45. data["begin"] = i * 5
  46. user_agent = random.choice(user_agent_list)
  47. headers = {
  48. "Cookie": cookie,
  49. "User-Agent": user_agent,
  50. }
  51. ip_headers = {
  52. 'User-Agent': user_agent
  53. }
  54. # 使用get方法进行提交
  55. content_json = requests.get(url, headers=headers, params=data).json()
  56. # 返回了一个json,里面是每一页的数据
  57. for item in content_json["app_msg_list"]:
  58. # 提取每页文章的标题及对应的url
  59. items = []
  60. items.append(item["title"])
  61. items.append(item["link"])
  62. t = time.localtime(item["create_time"])
  63. items.append(time.strftime("%Y-%m-%d %H:%M:%S", t))
  64. content_list.append(items)
  65. print(i)
  66. if (i > 0) and (i % 10 == 0):
  67. name = ['title', 'link', 'create_time']
  68. test = pd.DataFrame(columns=name, data=content_list)
  69. print(test)
  70. test.to_csv("url.csv", mode='a', encoding='utf-8')
  71. print("第" + str(i) + "次保存成功")
  72. content_list = []
  73. time.sleep(random.randint(60,90))
  74. else:
  75. time.sleep(random.randint(15,25))
  76. name = ['title', 'link', 'create_time']
  77. test = pd.DataFrame(columns=name, data=content_list)
  78. print(test)
  79. test.to_csv("url.csv", mode='a', encoding='utf-8')
  80. print("最后一次保存成功")