赞
踩
声明:本方法禁止用于违法用途,否则后果自负。
1.环境配置
(1)python 3.7
(2)使用pip 安装selenium
(3)安装Chrome浏览器,并下载与浏览器版本对应的chromedriver
2.代码
"""Export the member lists of the QQ groups the logged-in user has joined.

The script drives Chrome through Selenium: it logs in on qun.qq.com with the
"quick login" avatar, opens the member-management page, writes one file per
group into a dated folder, and finally merges those files into a single
de-duplicated list.
"""
import os
import re
import time
import datetime

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# XPath of the <li> entries in the first list of the "my-all-group" dialog.
_GROUP_ITEMS_XPATH = './/div[@class="my-all-group"]/ul[1]/li'

# Zero-based index of the <td> whose text is written out for each member row
# (the original code read ``tds[2:][2]``, i.e. the fifth cell).
_QQ_CELL_INDEX = 4

# Characters that are not allowed in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def scroll_foot(browser):
    """Scroll the document far down so lazily loaded rows get requested.

    Sets ``document.documentElement.scrollTop`` to 100000 via JavaScript and
    returns whatever ``execute_script`` returns.
    """
    js = 'var q=document.documentElement.scrollTop=100000'
    return browser.execute_script(js)


def get_qq(browser, qq_file):
    """Write one line per member row to ``qq_file`` and return the row count.

    A member row is any element with class ``mb``; the text of its fifth
    ``<td>`` is written (the page is expected to show the QQ number there).
    Rows with fewer cells are skipped instead of aborting the whole group.

    Returns:
        The number of lines written (0 when no row was found).
    """
    written = 0
    for row in browser.find_elements(By.CLASS_NAME, 'mb'):
        cells = row.find_elements(By.TAG_NAME, 'td')
        if len(cells) <= _QQ_CELL_INDEX:
            continue
        qq_file.write(cells[_QQ_CELL_INDEX].text + '\n')
        written += 1
    return written


def extractor(browser, qq_list_path):
    """Load the full member table of the current group and dump it to a file.

    Keeps scrolling (with a one-second pause) for as long as the page source
    keeps growing, then writes the rows to ``qq_list_path``.

    Returns:
        The number of members written, as reported by ``get_qq``.
    """
    known_len = 0
    while True:
        page_len = len(browser.page_source)
        if page_len <= known_len:
            break
        known_len = page_len
        scroll_foot(browser)
        time.sleep(1.0)

    # ``with`` guarantees the file is closed even if scraping raises.
    with open(qq_list_path, 'w', encoding='utf-8') as qq_list_file:
        return get_qq(browser, qq_list_file)


def login_spider(exe_path, url):
    """Start Chrome, open ``url`` and log in through the quick-login avatar.

    Args:
        exe_path: Path to the chromedriver executable.
        url: Address of the QQ group portal.

    Returns:
        The logged-in WebDriver instance. If any step fails, the browser is
        closed before the exception propagates.
    """
    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention; newer Selenium releases expect a Service object.
    browser = webdriver.Chrome(exe_path)
    try:
        browser.get(url)

        # Open the login dialog and wait for its iframe to exist.
        browser.find_element(By.CSS_SELECTOR, '#headerInfo p a').click()
        WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, '#loginWin iframe')
            )
        )
        print('登陆框已加载')

        # Navigate straight to the iframe's document so its elements can be
        # located without switching frames.
        iframe_url = browser.find_element(
            By.CSS_SELECTOR, '#loginWin iframe'
        ).get_attribute('src')
        browser.get(iframe_url)

        # Wait for the quick-login avatar list, then click the first avatar.
        WebDriverWait(browser, 10).until(
            EC.presence_of_all_elements_located((By.ID, 'qlogin_list'))
        )
        browser.find_element(By.CSS_SELECTOR, '#qlogin_list a').click()
        print('登陆成功')
    except Exception:
        browser.quit()
        raise
    return browser


def switch_spider(browser):
    """Go from the portal home page to the member-management window.

    Clicks the fourth entry of ``#headerNav``, then the ``color-tit`` element,
    and switches the driver to the second browser window.

    Returns:
        The same ``browser`` object, now focused on the second window.
    """
    nav_xpath = './/ul[@id="headerNav"]/li[4]'
    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, nav_xpath))
    )
    browser.find_element(By.XPATH, nav_xpath).click()

    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'color-tit'))
    )
    browser.find_element(By.CLASS_NAME, 'color-tit').click()

    # The click opens another window; wait for it so indexing cannot race.
    WebDriverWait(browser, 10).until(
        lambda drv: len(drv.window_handles) > 1
    )
    browser.switch_to.window(browser.window_handles[1])
    return browser


def _parse_group_title(title):
    """Split a ``name(id)`` title into ``(name, id)``.

    The last ``(`` is used as the separator so that group names containing
    parentheses are kept intact. Raises ValueError when there is no ``(``.
    """
    name, sep, tail = title.rpartition('(')
    if not sep:
        raise ValueError('unexpected group title: ' + repr(title))
    return name, tail.split(')')[0]


def start_spider(browser, dir):
    """Export every group in the group-selection dialog into ``dir``.

    For each list entry the group is opened, its members are written to
    ``<dir>/<name>_<id>``, and the selection dialog is reopened. A failure in
    one group is reported and the loop moves on to the next index.
    """
    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'my-all-group'))
    )
    group_items = browser.find_elements(By.XPATH, _GROUP_ITEMS_XPATH)
    group_num = len(group_items)

    for idx in range(group_num):
        try:
            group_items[idx].click()
            name_and_id = browser.find_element(By.ID, 'groupTit').text
            name, group_id = _parse_group_title(name_and_id)
            # Group names are free text; strip characters a file name
            # cannot contain.
            safe_name = _UNSAFE_FILENAME_CHARS.sub('_', name)
            qq_list_path = os.path.join(dir, safe_name + '_' + group_id)

            print('开始提取[' + name_and_id + ']')
            member_num = extractor(browser, qq_list_path)
            print('提取[' + name_and_id + ']成功:' + str(member_num) + '人')

            # Reopen the dialog and re-locate the entries: the old element
            # references belong to the previous dialog instance.
            browser.find_element(By.ID, 'changeGroup').click()
            WebDriverWait(browser, 10).until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, 'ui-dialog')
                )
            )
            group_items = browser.find_elements(By.XPATH, _GROUP_ITEMS_XPATH)
        except Exception as e:
            # Best effort per group, but never fail silently.
            print('提取第' + str(idx + 1) + '个群失败:' + repr(e))
            continue


def combine(dir, file_name):
    """Merge all per-group files in ``dir`` into ``dir/file_name``.

    The first whitespace-separated token of every non-blank line is taken as
    an entry; each distinct entry is written once, in first-seen order. The
    destination file itself and sub-directories are not read as input.
    Prints the number of distinct entries and of dropped duplicates.
    """
    dest_path = os.path.join(dir, file_name)
    # Snapshot the inputs before the destination is (re)created; sorted so
    # the output order does not depend on the directory listing order.
    sources = sorted(
        entry for entry in os.listdir(dir)
        if entry != file_name and os.path.isfile(os.path.join(dir, entry))
    )

    seen = set()
    re_count = 0
    with open(dest_path, 'w', encoding='utf-8') as dest_file:
        for entry in sources:
            with open(os.path.join(dir, entry), 'r',
                      encoding='utf-8') as src_file:
                for line in src_file:
                    tokens = line.split()
                    if not tokens:
                        continue
                    qq = tokens[0]
                    if qq in seen:
                        re_count += 1
                    else:
                        seen.add(qq)
                        dest_file.write(qq + '\n')
    print('合并成功,共' + str(len(seen)) + '人,去除重复' + str(re_count))


def main():
    """Run the whole export: log in, scrape every group, merge the results."""
    url = 'https://qun.qq.com/'
    exe_path = 'C:/attachment/chromedriver.exe'

    # Output folder is named dataset_<yyyymmdd> after today's date.
    today = datetime.datetime.today().strftime("%Y%m%d")
    output_dir = 'dataset_' + today
    file_name = 'all_qq.txt'
    # An already existing folder is fine; any other error should surface.
    os.makedirs(output_dir, exist_ok=True)

    browser = login_spider(exe_path, url)
    try:
        switch_spider(browser)
        start_spider(browser, output_dir)
    finally:
        # Always release the browser, even when scraping fails.
        browser.quit()
    combine(output_dir, file_name)


if __name__ == '__main__':
    main()
参考:
[1]Python爬虫使用selenium爬取qq群的成员信息(全自动实现自动登陆)[博客园]
[2] 教你用python爬取自己加入的QQ群成员名单 [知乎]
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。