

Accessing the Hot Search List and Sending It by Email

1. Requirements

Visit the Weibo hot search list (Sina Visitor System), collect the titles, links, and real-time hotness of the top 50 hot searches, and send the collected data to a personal mailbox by email every 20 seconds.

Notes:

  1. Define the request headers

This experiment needs four request header fields: User-Agent, Accept, Accept-Language, and Cookie. The first three are usually the same for everyone; what differs is mainly the Cookie. The fields are obtained as follows (a quick way to verify the copied values is sketched after the steps):

  1. Open the target page; for this experiment the target page is the Sina Visitor System.
  2. Press F12 to open the browser's developer tools.
  3. Press F5 to refresh the page; the developer tools now show the page's network traffic.
  4. Click Network, then All, then summary (the address of the target link).
  5. Clicking summary opens a panel on the right; click Headers there to see the request header fields, including the Cookie.
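Before wiring the copied fields into the crawler, they can be sanity-checked with a short request. This is only an illustrative sketch: the Cookie value is a placeholder you must replace with your own, and a valid Cookie should leave the final URL on s.weibo.com rather than redirecting to the Sina Visitor System page.

import urllib.request

# Placeholder header values copied from the developer tools; the Cookie is hypothetical
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
    'Cookie': 'YOUR_COOKIE_HERE',  # paste the Cookie value copied from DevTools
}

request = urllib.request.Request('https://s.weibo.com/top/summary', headers=header)
with urllib.request.urlopen(request) as response:
    # With a working Cookie the final URL stays on s.weibo.com instead of
    # redirecting to the visitor (passport) page
    print(response.status, response.geturl())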

Obtaining the Cookie

def job():
    print('************** Start crawling the Weibo hot search list **************')
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
        'Cookie': 'YOUR_COOKIE_HERE',  # fill in your own Cookie
    }
    url = 'https://s.weibo.com/top/summary'
    html = page_request(url=url, header=header)
    page_parse(html)

Email sending program.

Using NetEase (163) Mail as an example, enable the authorization code for your own account; QQ Mail works the same way.

Log in to your NetEase mailbox and enable the SMTP service; an authorization code will be generated. Fill that authorization code into the code below.
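Before plugging the authorization code into the full program, it can be verified with a quick standalone login attempt. A minimal sketch, assuming a NetEase (163) mailbox; the address and code below are placeholders.

import smtplib

HOST = 'smtp.163.com'                 # for QQ Mail use smtp.qq.com
FROM_ADDR = 'your_account@163.com'    # hypothetical address, use your own
PASSWORD = 'YOUR_AUTH_CODE'           # the authorization code, not the login password

smtp = smtplib.SMTP()
smtp.connect(HOST, 25)
# login() raises smtplib.SMTPAuthenticationError if the authorization code is wrong
smtp.login(FROM_ADDR, PASSWORD)
print('SMTP login OK')
smtp.quit()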

class Email163(object):
    HOST = 'smtp.qq.com'          # for NetEase Mail use smtp.163.com
    PORT = 25                     # default port, no need to change
    PASSWORD = 'XXXXXXXXX'        # fill in your own authorization code
    FROM_ADDR = 'XXX@qq.com'      # fill in your own mailbox
    SUBTYPE_PLAIN = 'plain'
    SUBTYPE_HTML = 'html'
    ATTACHMENT = 'attachment'
    EMBEDDED = 'embedded'

    def __init__(self, body: str, to_addrs: str) -> None:
        msg = MIMEText(body, self.SUBTYPE_PLAIN, 'utf-8')
        msg['From'] = self.FROM_ADDR
        msg['To'] = to_addrs
        # Set the mail format and subject
        msg['Subject'] = Header('Weibo Hot Search', 'utf-8')
        self.msg = msg.as_string()
        self.to_addrs = to_addrs

    def send_default_email(self) -> None:
        try:
            smtp = smtplib.SMTP()
            smtp.connect(self.HOST, self.PORT)
            smtp.login(self.FROM_ADDR, self.PASSWORD)
            smtp.sendmail(self.FROM_ADDR, self.to_addrs, self.msg)
            smtp.close()
            print(f'Email successfully sent to: {self.to_addrs}')
        except smtplib.SMTPException:
            raise Exception(f'Failed to send email to {self.to_addrs}')
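For reference, a minimal usage sketch of the class above; the recipient address and the body string are placeholders.

# Hypothetical recipient; the body here is just a test string
email = Email163('test body', 'receiver@example.com')
email.send_default_email()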

Complete code

# Crawler-related modules
from bs4 import BeautifulSoup
# Email-sending modules
import smtplib
from email.mime.text import MIMEText
from email.header import Header
# Scheduling modules
import schedule
import time
# Page-request modules
import urllib.request
import urllib.error
import gzip


class Email163(object):
    HOST = 'smtp.163.com'
    PORT = 25                      # default port, no need to change
    PASSWORD = 'XXXXXXXXXXX'       # authorization code
    FROM_ADDR = 'XXX@163.com'      # fill in your own mailbox
    SUBTYPE_PLAIN = 'plain'
    SUBTYPE_HTML = 'html'
    ATTACHMENT = 'attachment'
    EMBEDDED = 'embedded'

    def __init__(self, body: str, to_addrs: str) -> None:
        msg = MIMEText(body, self.SUBTYPE_PLAIN, 'utf-8')
        msg['From'] = self.FROM_ADDR
        msg['To'] = to_addrs
        # Set the mail format and subject
        msg['Subject'] = Header('Weibo Hot Search', 'utf-8')
        self.msg = msg.as_string()
        self.to_addrs = to_addrs

    def send_default_email(self) -> None:
        try:
            smtp = smtplib.SMTP()
            smtp.connect(self.HOST, self.PORT)
            smtp.login(self.FROM_ADDR, self.PASSWORD)
            smtp.sendmail(self.FROM_ADDR, self.to_addrs, self.msg)
            smtp.close()
            print(f'Email successfully sent to: {self.to_addrs}')
        except smtplib.SMTPException:
            raise Exception(f'Failed to send email to {self.to_addrs}')


# Request the page
def page_request(url, header):
    request = urllib.request.Request(url, headers=header)
    html = ''
    try:
        response = urllib.request.urlopen(request)
        if response.info().get('Content-Encoding') == 'gzip':
            # If the response is gzip-compressed, decompress it first
            compressed_data = response.read()
            decompressed_data = gzip.decompress(compressed_data)
            html = decompressed_data.decode('utf-8')
        else:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    return html


# Parse the page
def page_parse(html):
    soup = BeautifulSoup(html, 'lxml')
    news = []
    # Process the top 50 hot searches
    urls_title = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > a')
    hotness = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > span')
    for i in range(len(urls_title)):
        new = {}
        title = urls_title[i].get_text()
        url = urls_title[i].get('href')
        # Some links are abnormal and carry the real address in href_to
        if url == 'javascript:void(0);':
            url = urls_title[i].get('href_to')
        # The pinned top item does not show a hotness value
        if i == 0:
            hot = 'top'
        else:
            hot = hotness[i - 1].get_text()
        new['title'] = title
        new['url'] = "https://s.weibo.com" + url
        new['hot'] = hot
        news.append(new)
    print(len(news))
    print(news)
    for element in news:
        print(element['title'] + '\t' + element['hot'] + '\t' + element['url'])
    content = ''
    for i in range(len(news)):
        content += str(i) + '、\t' + news[i]['title'] + '\t' + 'Hotness: ' + news[i]['hot'] + '\t' + 'Link: ' + news[i]['url'] + ' \n'
    get_time = time.strftime('%Y-%m-%d %X', time.localtime(time.time())) + '\n'
    content += 'Retrieved at ' + get_time
    to_addrs = 'RECEIVER_EMAIL'  # fill in the recipient mailbox
    email163 = Email163(content, to_addrs)
    email163.send_default_email()


def job():
    print('************** Start crawling the Weibo hot search list **************')
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
        'Cookie': 'YOUR_COOKIE_HERE',  # fill in your own Cookie
    }
    url = 'https://s.weibo.com/top/summary'
    html = page_request(url=url, header=header)
    page_parse(html)


if __name__ == "__main__":
    # Crawl the Weibo hot search list every 20 seconds and send the result to a personal mailbox
    # The 20 can be changed to another interval
    schedule.every(20).seconds.do(job)
    while True:
        schedule.run_pending()
        time.sleep(1)  # avoid busy-waiting between checks
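The interval is easy to adjust with the same schedule API; for example, to send once every 10 minutes instead of every 20 seconds:

schedule.every(10).minutes.do(job)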