Hi everyone, I'm 小码农. Today I'd like to share an interesting project: using Python to scrape the trending articles on 今日头条 (Toutiao).

With the rapid growth of the internet, people are paying ever more attention to current news and trending topics. Toutiao, one of China's best-known news platforms, aggregates a large number of trending articles, with a huge volume of new content published every day. So we want to write a Python crawler that automatically fetches Toutiao's trending articles, giving us a clearer picture of the topics and social issues that are hot right now.
First, we import the required libraries and modules:
```python
import bag
import re
from bs4 import BeautifulSoup
from urllib.parse import quote
import requests
```
Next, we define a function get_hot_list() that fetches Toutiao's trending list:
```python
def get_hot_list():
    url = r'https://api.vvhan.com/api/hotlist/toutiao'
    resp = session.get(url)
    resp.encoding = 'utf8'
    resp.close()  # good habit: release the connection once the request is done
    result = []
    for ls in resp.json().get('data'):
        # each entry becomes a single-key dict: {title: hot_score}
        result.append({ls.get('title'): ls.get('hot')})
    return result
```
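Each entry of the returned list is a single-key dict mapping a title to its hot score, which is why the next function extracts the keyword with list(dic)[0]. Here is a minimal usage sketch, assuming the response shape shown above (the printed format is mine, not from the source):

```python
# Hypothetical usage: print the trending list returned by get_hot_list().
hot_list = get_hot_list()
for dic in hot_list:
    title = list(dic)[0]  # the dict's only key is the article title
    print(f'{title} -> {dic[title]}')  # its value is the hot score
```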
Next, we define a function Get_relevant_articles(dic) that fetches the articles related to a given trending topic:
```python
def Get_relevant_articles(dic):
    # the topic title is the dict's only key; URL-encode it as the search keyword
    url = r'https://so.toutiao.com/search?dvpf=pc&source=input&keyword={}'.format(
        quote(list(dic)[0], encoding='utf8'))
    headers = {
        "Referer": url,
        "Host": "so.toutiao.com"
    }
    session.headers = headers
    session.headers['Cookie'] = r'xxxxxxxxxxxxxxxxxxx'  # paste your own cookie string copied from the browser
    resp = session.get(url)
    resp.encoding = 'utf-8'
    resp.close()
    # pull (title, share_url) pairs out of the embedded JSON, stripping the
    # escaped <em>...</em> highlight tags around the matched keyword
    url_list = [[i[0].replace('\\u003c', '').replace('em', '').replace('\\u003e', '').replace('/', ''), i[1]]
                for i in re.findall(r'"title":"(.*?)".*?"share_url":"(.*?)"', resp.text)
                if i[0] != '']
    title = re.compile(r'<strong>(.*?)</strong>', re.S)
    result = []
    for ls in url_list:
        try:
            resp1 = requests.get(ls[-1])
            resp1.close()
            soup = BeautifulSoup(resp1.text, 'html.parser')
            html = soup.find_all('div', class_='a-con')
            mid = []
            for p in html:
                mid.extend(re.findall(r'<p>(.*?)</p>', str(p)))
            result.append([re.findall(title, resp1.text)[0], '\n'.join(mid)])
        except Exception:
            continue  # skip pages whose layout does not match
    # bag.Bag.save_excel(result, './头条热点文章.xlsx')  # save the articles to Excel
    return result
```
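bag.Bag.save_excel comes from the author's personal bag helper library. If you don't have it, the standard library can stand in; below is a minimal sketch that writes the same [title, content] rows to CSV (the save_to_csv name and the CSV format are my assumptions, not from the source):

```python
import csv

def save_to_csv(rows, path='./头条热点文章.csv'):
    """Hypothetical stand-in for bag.Bag.save_excel: write [title, content] rows to CSV."""
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:  # utf-8-sig so Excel displays Chinese correctly
        writer = csv.writer(f)
        writer.writerow(['title', 'content'])
        writer.writerows(rows)
```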
Finally, main() calls the two functions above to run the whole pipeline:
```python
def main():
    hot_list = get_hot_list()
    for dic in hot_list[:1]:  # demo: crawl only the first trending topic
        Get_relevant_articles(dic)
```
The complete code is as follows:

```python
#!/usr/bin/env python3
# coding:utf-8
import bag
import re
from bs4 import BeautifulSoup
from urllib.parse import quote
import requests


def main():
    hot_list = get_hot_list()
    for dic in hot_list[:1]:  # demo: crawl only the first trending topic
        Get_relevant_articles(dic)


def Get_relevant_articles(dic):
    # the topic title is the dict's only key; URL-encode it as the search keyword
    url = r'https://so.toutiao.com/search?dvpf=pc&source=input&keyword={}'.format(
        quote(list(dic)[0], encoding='utf8'))
    headers = {
        "Referer": url,
        "Host": "so.toutiao.com"
    }
    session.headers = headers
    session.headers['Cookie'] = r'tt_webid=7349741726641210919; _ga=GA1.1.1593236486.1711245116; _tea_utm_cache_4916=undefined; _S_DPR=1.25; _S_IPAD=0; s_v_web_id=verify_lu4vah8p_O0eJgr0E_sLhQ_4Uvc_9sss_Y5GxuDq6d5ze; msToken=1-tj_F8UanP9ipxwb8AGOtlYFUBckmgeCpbsyLmWl1TLeHmtakVdRA_tar8htpfsa_3-l66NSL7j_b72_X6im2OY9auiliODwSFBFGZg; ttwid=1%7CrTMoH6_equv6Fj5KhisifcjXO0dY3yXbq3dROS5p7oQ%7C1711245342%7Ccebddba5ac70fb0ee50b6642caaa41e0e0466459e2cbbd2ea69f67ff0b2ca83d; _ga_QEHZPBE5HH=GS1.1.1711245116.1.1.1711246976.0.0.0; _S_WIN_WH=650_608; __ac_nonce=065ff9f2a00b65ed4b389; __ac_signature=_02B4Z6wo00f01JSasJgAAIDDqTOqBst0l9CUurQAAEDdb3; __ac_referer=__ac_blank'  # replace with your own cookie string copied from the browser
    resp = session.get(url)
    resp.encoding = 'utf-8'
    resp.close()
    # pull (title, share_url) pairs out of the embedded JSON, stripping the
    # escaped <em>...</em> highlight tags around the matched keyword
    url_list = [[i[0].replace('\\u003c', '').replace('em', '').replace('\\u003e', '').replace('/', ''), i[1]]
                for i in re.findall(r'"title":"(.*?)".*?"share_url":"(.*?)"', resp.text)
                if i[0] != '']
    title = re.compile(r'<strong>(.*?)</strong>', re.S)
    result = []
    for ls in url_list:
        try:
            resp1 = requests.get(ls[-1])
            resp1.close()
            soup = BeautifulSoup(resp1.text, 'html.parser')
            html = soup.find_all('div', class_='a-con')
            mid = []
            for p in html:
                mid.extend(re.findall(r'<p>(.*?)</p>', str(p)))
            result.append([re.findall(title, resp1.text)[0], '\n'.join(mid)])
        except Exception:
            continue  # skip pages whose layout does not match
    # bag.Bag.save_excel(result, './头条热点文章.xlsx')  # save the articles to Excel
    return result


def get_hot_list():
    url = r'https://api.vvhan.com/api/hotlist/toutiao'
    resp = session.get(url)
    resp.encoding = 'utf8'
    resp.close()  # good habit: release the connection once the request is done
    result = []
    for ls in resp.json().get('data'):
        # each entry becomes a single-key dict: {title: hot_score}
        result.append({ls.get('title'): ls.get('hot')})
    return result


if __name__ == '__main__':
    session = bag.session.create_session()
    session.get('https://www.toutiao.com/')  # warm-up request against the homepage
    main()
```
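Note that session is created by bag.session.create_session(), again from the author's personal library. If you want to run this without bag, a plain requests.Session is the obvious substitute, though whether it collects enough anti-bot cookies is untested; the User-Agent below is my assumption, not from the source:

```python
import requests

# Hypothetical replacement for bag.session.create_session():
# a plain requests.Session with a browser-like User-Agent.
session = requests.Session()
session.headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/120.0.0.0 Safari/537.36')
session.get('https://www.toutiao.com/')  # warm-up request to pick up base cookies
```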
With the code above, we have successfully fetched the trending articles on Toutiao. By parsing the article contents we can dig deeper into the stories behind the hot topics, and of course you can save or further process the fetched articles to suit your own needs.

Through this project we learned how to use Python to crawl Toutiao's trending articles and uncover the stories behind the hot topics. Web scraping has real practical value in today's information age, and I hope this project helps you better understand how the technique is applied in practice.

Thanks for reading! If you have any questions or suggestions, feel free to leave a comment below. Thank you!