1. First set up Selenium with Chrome. Setup guides are easy to find and the process is simple; a minimal sketch follows this list.
2. Find the URL of the Toutiao account (头条号) you want to scrape. This article uses Guangming Online (光明网) as the example.
3. Inspect the page structure to work out the XPath of each piece of data.
4. Use those XPaths to scrape the article title, read count, comment count, publish time, and full text.
5. Code implementation (with comments).
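For step 1, a minimal setup sketch (assuming Python 3 and Selenium 4, whose bundled Selenium Manager downloads a matching chromedriver automatically; on older Selenium versions you would install chromedriver yourself and put it on PATH):

# pip install selenium pandas openpyxl
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

opt = Options()
# Hide the "Chrome is being controlled by automated test software" banner
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
browser = Chrome(options=opt)
browser.get('https://www.toutiao.com')  # smoke test: open the Toutiao home page
print(browser.title)
browser.quit()

The full script for step 5 follows.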
# Author: 小李同学
# Date: 2022/7/11 9:07

import openpyxl

from selenium.webdriver import Chrome
import pandas as pd
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.options import Options
import re
from datetime import datetime, timedelta
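# openpyxl refuses to write ASCII control characters into .xlsx cells,
# so strip them from scraped text before saving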
ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

# URL of the Toutiao account to scrape
net = input('Enter the URL of the Toutiao account: ')
num_1 = int(input('Enter the number of scrolls (more scrolls collect more data but take longer): '))
file_name = input('Enter the output file name: ')
url = net
opt = Options()
# Hide the "Chrome is being controlled by automated test software" banner
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
# Create the browser object
toutiao = Chrome(options=opt)
toutiao.get(url)
time.sleep(2)
# Click the "articles" tab; this absolute XPath reflects the page layout at the time of writing and may need updating
toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[1]/ul/li[2]').click()
time.sleep(2)

# Scroll to the bottom repeatedly so the feed lazy-loads more articles
count = 0
while count < num_1:
    toutiao.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)
    count += 1
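# Note (assumption): the fixed 1-second pause per scroll is a guess; on a slow
# connection the feed may not finish loading between scrolls, so fewer items
# get collected. Selenium's WebDriverWait with expected_conditions is the more
# robust alternative to these fixed sleeps.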

time.sleep(2)
# Scroll back to the top, otherwise the article bodies cannot be scraped
js = "window.scrollTo(0,0)"
toutiao.execute_script(js)
time.sleep(1)
# The divs holding all the article entries
div_list = toutiao.find_elements(by=By.XPATH, value='//*[@id="root"]/div/div[3]/div[1]/div/div[2]/div/div/div')
time.sleep(2)
date_list = []
# print(div_list)


# Defined once here rather than inside the scraping loop below
def parseTime(timess):
    """Normalize a relative timestamp such as '3分钟前', '2小时前', '5天前',
    '1周前' or '2个月前' (or English forms like '2 hours ago') to 'YYYY-MM-DD'.
    Absolute dates fall through to the final branch, which re-joins the
    digit groups it finds."""
    num = re.search(r'\d+', timess)  # the leading count, e.g. '3' in '3小时前'
    count_n = int(num.group()) if num else 0
    if ('分钟' in timess) or ('分鐘' in timess) or ('min' in timess) or ('分前' in timess):
        delta = timedelta(minutes=count_n)
    elif ('小时' in timess) or ('小時' in timess) or ('hour' in timess):
        delta = timedelta(hours=count_n)
    elif ('天前' in timess) or ('day' in timess):
        delta = timedelta(days=count_n)
    elif ('周前' in timess) or ('週前' in timess) or ('week' in timess) or ('週' in timess):
        delta = timedelta(weeks=count_n)
    elif ('个月前' in timess) or ('個月前' in timess) or ('month' in timess):
        delta = timedelta(days=count_n * 30)  # approximate a month as 30 days
    else:
        # Probably an absolute date such as '2022-07-01': re-join its digit groups
        try:
            parts = re.findall(r'\d+', str(timess))
            return parts[0] + '-' + parts[1] + '-' + parts[2]
        except Exception as e_time:
            print(e_time)
            return timess
    return (datetime.now() - delta).strftime('%Y-%m-%d')


for div in div_list:
    # Title
    title_name = div.find_element(by=By.XPATH, value='./div/div/div/a').text
    # Read count
    read = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[1]').text
    # Comment count
    comment = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[2]').text
    # Publish time
    timess = div.find_element(by=By.XPATH, value='./div/div/div/div/div[1]/div[3]').text
    # print('Title:', title_name)
    # print('Reads:', read)
    # print('Comments:', comment)
    # print('Published:', timess)
    time.sleep(2)
    # Click the article entry to open it
    div.find_element(by=By.XPATH, value='./div/div/div/a').click()
    time.sleep(2)
    # Switch to the newly opened article tab to grab the full text
    toutiao.switch_to.window(toutiao.window_handles[1])
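    # Note (assumption): window_handles[1] presumes the click opened exactly one
    # new tab; if the article opens in the same tab instead, this switch will fail.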
    time.sleep(2)
    page_detail = toutiao.find_element(by=By.XPATH, value='//*[@id="root"]/div[2]/div[2]/div[1]/div/article').text
    # Strip characters that openpyxl cannot write
    page_detail = ILLEGAL_CHARACTERS_RE.sub(r'', page_detail)
    # print('Body:', page_detail)
    time.sleep(2)
    # Close the article tab
    toutiao.close()
    # Switch back to the original window
    toutiao.switch_to.window(toutiao.window_handles[0])
    time.sleep(2)
    record = {
        'Title': title_name,
        'Reads': read,
        'Comments': comment,
        'Published': parseTime(timess),
        'Body': page_detail
    }
    date_list.append(record)

try:
    # Save the data; recent pandas versions no longer accept an encoding
    # argument for to_excel, so it is omitted here
    pd.DataFrame(date_list).to_excel(file_name + '.xlsx', index=False)
except openpyxl.utils.exceptions.IllegalCharacterError:
    print('Illegal character encountered')

toutiao.quit()  # close the browser now that scraping is done
print('over!!!')
6. Results showcase.
7. This is my first scraper and it has plenty of rough edges; feedback and suggestions are welcome.
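As a quick sanity check on the output (a minimal sketch; file_name is whatever name you entered when the script prompted for it), the spreadsheet can be read back with pandas:

import pandas as pd

df = pd.read_excel(file_name + '.xlsx')
print(df.shape)   # (rows, columns): one row per scraped article
print(df.head())  # first records: Title, Reads, Comments, Published, Body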