
Toutiao Account Scraper in Practice: Collecting Article Data

1. First, set up Selenium with Chrome and a matching ChromeDriver. Setup guides are easy to find, and the process is straightforward; a quick environment check is sketched below.
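
If Selenium 4.6 or newer is installed (pip install selenium), Selenium Manager downloads a matching ChromeDriver automatically, so a minimal smoke test is enough to confirm the setup. A sketch, assuming Chrome is installed locally:

# Smoke test for the Selenium + Chrome setup.
# Assumes Selenium 4.6+ so Selenium Manager resolves the driver itself;
# on older versions, chromedriver must be on your PATH.
from selenium.webdriver import Chrome

driver = Chrome()
driver.get('https://www.toutiao.com')  # open any page to verify the setup
print(driver.title)
driver.quit()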


2. Find the URL of the Toutiao account you want to scrape. This article uses 光明网 (Guangming Online) as the example:

https://www.toutiao.com/c/user/token/MS4wLjABAAAA9Lz0MeLdJDmqpU26Xi9O_M-cYI9z530wjM7eDKvzZTw/?source=feed&log_from=47e4ed6a059e5_1657954170450


3. Inspect the rendered page to work out the XPath of each element you need, as sketched below.
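
In Chrome DevTools, right-click an element, choose Inspect, then Copy > Copy XPath; you can also test a path with $x('...') in the DevTools console. A small sketch for verifying a path with Selenium before hard-coding it (the XPath below is a placeholder for illustration, not the real selector):

# Sanity-check an XPath before wiring it into the scraper.
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

driver = Chrome()
# Example account page from step 2 (tracking query parameters dropped)
driver.get('https://www.toutiao.com/c/user/token/MS4wLjABAAAA9Lz0MeLdJDmqpU26Xi9O_M-cYI9z530wjM7eDKvzZTw/')
# Placeholder XPath: replace with the path copied from DevTools
hits = driver.find_elements(by=By.XPATH, value='//*[@id="root"]//a')
print('matched', len(hits), 'element(s)')
driver.quit()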

 

4. Using those XPath paths, scrape each article's title, read count, comment count, publication time, and full text.


5. Implementation (with comments)

# Author: 小李同学
# Date: 2022/7/11 9:07
import re
import time
from datetime import datetime, timedelta

import openpyxl
import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Characters that openpyxl refuses to write into an .xlsx cell
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

# URL of the Toutiao account to scrape
net = input('Enter the Toutiao account URL: ')
num_1 = int(input('Enter the number of scrolls (more scrolls = more data, but slower): '))
file_name = input('Enter the output file name: ')
url = net

opt = Options()
# Hide the "Chrome is being controlled by automated test software" banner
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# Create the browser instance
toutiao = Chrome(options=opt)
toutiao.get(url)
time.sleep(2)

# Click the "Articles" tab
toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[1]/ul/li[2]').click()
time.sleep(2)

# Scroll down repeatedly to trigger lazy loading of more articles
count = 0
while count < num_1:
    toutiao.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)
    count += 1
time.sleep(2)

# Scroll back to the top, otherwise the article list cannot be scraped
js = "window.scrollTo(0,0)"
toutiao.execute_script(js)
time.sleep(1)

# The <div> containers of all loaded articles
div_list = toutiao.find_elements(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[2]/div/div/div')
time.sleep(2)
data_list = []


def parseTime(timess):
    """Convert a relative timestamp such as '5分钟前' or '2 hours ago' to 'YYYY-MM-DD'."""
    if ('分钟前' in timess) or ('分鐘前' in timess) or ('minute' in timess) or ('分鐘' in timess) or ('分前' in timess):
        try:
            minutes = timedelta(minutes=int(timess[:timess.find('分鐘')]))
        except ValueError:
            minutes = timedelta(minutes=int(timess[:timess.find('分钟前')]))
        FormatedTime = (datetime.now() - minutes).strftime('%Y-%m-%d')
    elif 'mins ago' in timess:
        minutes = timedelta(minutes=int(timess[:timess.find('mins ago')]))
        FormatedTime = (datetime.now() - minutes).strftime('%Y-%m-%d')
    elif 'hours ag' in timess:
        hour = timedelta(hours=int(timess[:timess.find('hours ag')]))
        FormatedTime = (datetime.now() - hour).strftime('%Y-%m-%d')
    elif ('小时前' in timess) or ('小時前' in timess) or ('hour' in timess) or ('小時' in timess):
        try:
            hour = timedelta(hours=int(timess[:timess.find('小时前')]))
        except ValueError:
            hour = timedelta(hours=int(timess[:timess.find('小時')]))
        FormatedTime = (datetime.now() - hour).strftime('%Y-%m-%d')
    elif ('天前' in timess) or ('day' in timess):
        day = timedelta(days=int(timess[:timess.find('天前')]))
        FormatedTime = (datetime.now() - day).strftime('%Y-%m-%d')
        parts = re.findall(r'\d+', FormatedTime)
        return parts[0] + '-' + parts[1] + '-' + parts[2]
    elif ('周前' in timess) or ('週前' in timess) or ('week' in timess) or ('週' in timess):
        try:
            week = timedelta(weeks=int(timess[:timess.find('周前')]))
        except ValueError:
            week = timedelta(weeks=int(timess[:timess.find('週')]))
        FormatedTime = (datetime.now() - week).strftime('%Y-%m-%d')
        parts = re.findall(r'\d+', FormatedTime)
        return parts[0] + '-' + parts[1] + '-' + parts[2]
    elif ('个月前' in timess) or ('個月前' in timess) or ('month' in timess):
        month = timedelta(days=int(timess[:timess.find('个月前')]) * 30)
        FormatedTime = (datetime.now() - month).strftime('%Y-%m-%d')
        parts = re.findall(r'\d+', FormatedTime)
        return parts[0] + '-' + parts[1] + '-' + parts[2]
    else:
        # Absolute dates such as '2022-07-11': keep the digit groups only
        try:
            parts = re.findall(r'\d+', str(timess))
            return parts[0] + '-' + parts[1] + '-' + parts[2]
        except Exception as e_time:
            print(e_time)
            return timess
    # The minute/hour branches fall through to here
    parts = re.findall(r'\d+', str(FormatedTime))
    return parts[0] + '-' + parts[1] + '-' + parts[2]


for div in div_list:
    # Title
    title_name = div.find_element(by=By.XPATH, value='./div/div/div/a').text
    # Read count
    read = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[1]').text
    # Comment count
    comment = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[2]').text
    # Publication time (relative, e.g. '3小时前')
    timess = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[3]').text
    # print('Title:', title_name)
    # print('Reads:', read)
    # print('Comments:', comment)
    # print('Published:', timess)
    time.sleep(2)
    # Open the article (it opens in a new tab)
    div.find_element(by=By.XPATH, value='./div/div/div/a').click()
    time.sleep(2)
    # Switch to the article tab to grab the full text
    toutiao.switch_to.window(toutiao.window_handles[1])
    time.sleep(2)
    page_detail = toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div[2]/div[2]/div[1]/div/article').text
    # Strip characters that Excel cannot store
    page_detail = ILLEGAL_CHARACTERS_RE.sub(r'', page_detail)
    # print('Full text:', page_detail)
    time.sleep(2)
    # Close the article tab
    toutiao.close()
    # Switch back to the list window
    toutiao.switch_to.window(toutiao.window_handles[0])
    time.sleep(2)
    record = {
        '标题': title_name,             # title
        '阅读数': read,                 # read count
        '评论数': comment,              # comment count
        '发布时间': parseTime(timess),  # publication date
        '原文内容': page_detail,        # full text
    }
    data_list.append(record)

# Save the data (column headers stay in Chinese to match the original output;
# .xlsx files are always UTF-8, so no encoding argument is needed)
try:
    pd.DataFrame(data_list).to_excel(file_name + '.xlsx', index=False)
except openpyxl.utils.exceptions.IllegalCharacterError:
    print('Illegal character encountered')
print('over!!!')
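
One obvious refinement: the fixed time.sleep() calls are fragile and slow. Selenium's explicit waits poll the DOM until a condition holds, so the script proceeds as soon as an element is ready. A sketch of how the article-tab click could be rewritten, reusing the toutiao driver and the XPath from the listing above (the 10-second timeout is an arbitrary choice):

# Sketch: explicit wait instead of a fixed sleep (Selenium 4 API).
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(toutiao, timeout=10)  # poll the DOM for up to 10 s
tab = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="root"]/div/div[3]/div[1]/div/div[1]/ul/li[2]')))
tab.click()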

6. Results: the script writes one row per article (title, read count, comment count, publication date, full text) to the file_name.xlsx workbook.

7. This is my first scraper, so there is plenty of room for improvement; comments and pointers from experienced readers are welcome.
