```python
import csv

import requests
from bs4 import BeautifulSoup

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36"
}  # Request headers that mimic a real browser

url = "https://blog.csdn.net/u010163987?type=blog"  # URL to scrape
result_list = []

# My blog's article list only has three pages, but the request below can be
# wrapped in a loop over pages so that nothing is missed.
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding  # set the page encoding
content = response.content                      # raw page content
soup = BeautifulSoup(content, 'html.parser')    # parse the HTML

article_list = soup.find('article', class_='blog-list-box')  # article list container
print(article_list)

for article in article_list.find_all('div', class_='list-box-cont'):
    result = {}
    # Walk back up to the enclosing <article> element for this entry
    blog_box = article.find_parent('article', class_='blog-list-box')
    if blog_box:
        title_href = blog_box.find('a')  # look for the <a> tag inside the article
        if title_href and 'href' in title_href.attrs:  # the <a> exists and has an href
            result['href'] = title_href['href']  # article link
        else:
            result['href'] = None  # no href found
    else:
        result['href'] = None  # no enclosing article found

    # Article title
    title_div = article.find('div', class_='blog-list-box-top')
    if title_div:
        result['name'] = title_div.text.strip()  # article title
    else:
        result['name'] = None  # or set a default value as needed

    result_list.append(result)  # add each article's info to the list

print(result_list)

# with open('csdn_paqu.csv', 'w', encoding='utf-8', newline='') as f:
#     writer = csv.DictWriter(f, fieldnames=['href', 'name'])  # CSV writer
#     writer.writeheader()           # write the column headers
#     writer.writerows(result_list)  # write the whole list of dicts to the CSV file
```
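The comment in the script mentions looping over several listing pages, but the snippet above fetches only one URL. Below is a minimal sketch of how the fetch-and-parse step could be repeated per page. The `page` query parameter, the page count of 3, and the `parse_page` helper are illustrative assumptions, not confirmed details of CSDN's listing; only the class names come from the original code, and the parsing is slightly restructured to read each `<article class="blog-list-box">` card directly.

```python
import requests
from bs4 import BeautifulSoup

headers = {"user-agent": "Mozilla/5.0"}  # shortened here; reuse the full header above
base_url = "https://blog.csdn.net/u010163987"


def parse_page(html):
    """Extract {'href', 'name'} dicts from one listing page's HTML (assumed layout)."""
    soup = BeautifulSoup(html, "html.parser")
    results = []
    # Assumption: each post card is its own <article class="blog-list-box">
    for box in soup.find_all("article", class_="blog-list-box"):
        link = box.find("a")
        title = box.find("div", class_="blog-list-box-top")
        results.append({
            "href": link["href"] if link and link.has_attr("href") else None,
            "name": title.text.strip() if title else None,
        })
    return results


result_list = []
for page in range(1, 4):  # assumed three listing pages
    # Hypothetical pagination parameter; adjust to whatever the site actually uses
    resp = requests.get(f"{base_url}?type=blog&page={page}", headers=headers)
    resp.encoding = resp.apparent_encoding
    result_list.extend(parse_page(resp.text))

print(len(result_list), "articles collected")
```

The commented-out `csv.DictWriter` block from the main script would then write the accumulated `result_list` to disk in one pass after the loop finishes.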