以成都地铁官微为例,用爬虫获取成都地铁官方微博的粉丝数量:
- # pyautogui通过模板匹配,找到新浪微博账户显示粉丝数量的位置,
- # 然后双击使得数字处于选中状态,再用pyperclip获取粘贴板数字。
- import time
- import pandas as pd
- from tqdm import tqdm
- import pyautogui
- import webbrowser as wb
- from selenium import webdriver
- import pyperclip
-
-
# Before running: open the browser manually and maximize its window,
# so the template image matches what is on screen.
def get_city_data(city, wait_seconds=10):
    """Scrape the follower count of one Weibo account via screen automation.

    Opens the account page in the default browser, locates the follower
    number on screen by template matching, double-clicks to select it,
    copies it, and reads it from the clipboard.

    Args:
        city: tuple of (display_name, weibo_numeric_uid).
        wait_seconds: seconds to wait for the page to load before matching.

    Returns:
        (display_name, follower_count_as_int, 'YYYY-MM-DD HH:MM' timestamp).

    Raises:
        RuntimeError: if the template image is not found on screen.
        ValueError: if the clipboard text is not a parseable integer.
    """
    url = f'https://weibo.com/{city[1]}'
    wb.open(url=url)

    # Let the page finish rendering before template matching.
    pyautogui.sleep(wait_seconds)

    # 'loc.png' is a screenshot template captured beforehand on this machine;
    # its pixel size depends on the screen resolution, so re-capture it when
    # running on a different machine/display.
    locate = pyautogui.locateOnScreen('loc.png')
    if locate is None:
        # locateOnScreen returns None when no match is found; fail loudly
        # instead of letting pyautogui.center(None) raise an opaque error.
        raise RuntimeError('template image loc.png was not found on screen')
    center_x, center_y = pyautogui.center(locate)

    # Vertical offset (pixels) from the template's center up to the number.
    HEIGHT = 38
    pyautogui.moveTo(center_x, center_y - HEIGHT)
    pyautogui.sleep(1)
    pyautogui.doubleClick()          # double-click selects the number text
    pyautogui.sleep(1)
    pyautogui.hotkey('ctrl', 'c')    # copy the selection to the clipboard

    # Normalize before int(): drop surrounding whitespace and thousands
    # separators (assumes the count is shown as plain digits — TODO confirm
    # it is never abbreviated, e.g. with a '万' suffix).
    num = pyperclip.paste().strip().replace(',', '')

    return city[0], int(num), time.strftime('%Y-%m-%d %H:%M', time.localtime())
-
-
def main():
    """Collect follower counts for each configured account and save them.

    Builds a DataFrame of (city, follower count, timestamp) rows sorted by
    follower count descending, prints the top rows, and writes the table to
    Excel and CSV files in the working directory.
    """
    # (display_name, weibo_numeric_uid) pairs to scrape.
    city = [
        ('成都', '2384889627'),
    ]

    city_data = []
    # Context manager guarantees the progress bar is closed even on error
    # (the original never called pbar.close()).
    with tqdm(total=len(city), leave=True) as pbar:
        for c in city:
            result = get_city_data(c)
            print(result)
            city_data.append(list(result))
            pbar.update(1)

    col = ['城市', '粉丝数量', '统计时间']
    df = pd.DataFrame(data=city_data, columns=col)
    df = df.sort_values(by=col[1], axis=0, ascending=False)  # descending by followers

    # Reset the index after sorting so rows are numbered in display order,
    # then shift it to start at 1 instead of pandas' default 0.
    df = df.reset_index(drop=True)
    df.index = df.index + 1

    print(df.head(10))

    # NOTE(review): pandas removed the `encoding` kwarg from to_excel (1.5+)
    # and dropped legacy .xls writing (it needed xlwt), so write .xlsx;
    # Excel content is always UTF-8 internally.
    df.to_excel('city.xlsx')
    df.to_csv('city.csv', encoding='utf-8')
-
-
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()
输出: