当前位置:   article > 正文

使用python+selenium批量提取群成员QQ_selenium 违法吗

selenium 违法吗

声明:本方法仅供学习交流,禁止用于任何违法用途,否则后果自负。

1.环境配置

(1)python 3.7
(2)使用pip 安装selenium
(3)下载 Chrome 浏览器,并下载与浏览器版本对应的 chromedriver

2.代码

import os
import re
import time
import datetime

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def scroll_foot(browser):
    """Push the page view downwards by running a script in the browser.

    The script assigns 100000 to ``document.documentElement.scrollTop``.

    Args:
        browser: Object exposing ``execute_script(script)``.

    Returns:
        Whatever ``browser.execute_script`` returns for the script.
    """
    scroll_script = 'var q=document.documentElement.scrollTop=100000'
    outcome = browser.execute_script(scroll_script)
    return outcome

def get_qq(browser, qq_file):
    """Write one value per member row of the current page to ``qq_file``.

    Every element with class ``mb`` is treated as a row; the text of its
    fifth ``td`` cell (index 4) is written followed by a newline.
    Rows with fewer than five cells are skipped.

    Args:
        browser: Object exposing ``find_elements_by_class_name``.
        qq_file: Writable text file object.

    Returns:
        The number of rows written; 0 when no rows are found.
    """
    # Index of the cell that is written out (the original code sliced
    # off the first two cells and then took index 2 of the remainder).
    qq_cell_index = 4

    written = 0
    for row in browser.find_elements_by_class_name('mb'):
        cells = row.find_elements_by_tag_name('td')
        if len(cells) <= qq_cell_index:
            # Not a complete member row; skip it instead of crashing.
            continue
        qq_file.write(cells[qq_cell_index].text + '\n')
        written += 1
    # Return a real count: the previous ``return i`` was one short and
    # raised UnboundLocalError when the page had no rows at all.
    return written

def extractor(browser, qq_list_path):
    """Load the full member list by scrolling, then dump it to a file.

    The page is scrolled repeatedly (with a one-second pause after each
    scroll) for as long as ``browser.page_source`` keeps getting longer.
    After that, ``get_qq`` writes the rows to ``qq_list_path``.

    Args:
        browser: Browser object positioned on a member-list page.
        qq_list_path: Path of the output file; it is overwritten.

    Returns:
        The value returned by ``get_qq``.
    """
    current_len = 0
    # A growing page source means the last scroll loaded more content.
    while current_len < len(browser.page_source):
        current_len = len(browser.page_source)
        scroll_foot(browser)
        time.sleep(1.0)
    # ``with`` closes the file even if get_qq raises part-way through.
    with open(qq_list_path, 'w') as qq_list_file:
        member_num = get_qq(browser, qq_list_file)
    return member_num


def login_spider(exe_path, url):
    """Start Chrome, open ``url`` and click through the quick-login flow.

    Steps: open ``url``, click the link under ``#headerInfo``, wait for
    the login iframe, navigate directly to the iframe's ``src``, wait
    for ``#qlogin_list`` and click the first link inside it.

    Args:
        exe_path: Passed to ``webdriver.Chrome``.
        url: Page to open first.

    Returns:
        The Chrome browser object. The success message is printed right
        after the click; the login result itself is not checked.
    """
    iframe_selector = '#loginWin iframe'

    browser = webdriver.Chrome(exe_path)
    browser.get(url)

    # Open the login dialog.
    login_link = browser.find_element_by_css_selector('#headerInfo p a')
    login_link.click()
    iframe_present = EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, iframe_selector)
    )
    WebDriverWait(browser, 10).until(iframe_present)
    print('登陆框已加载')

    # Load the iframe's own URL as the top-level page.
    login_iframe = browser.find_element_by_css_selector(iframe_selector)
    iframe_url = login_iframe.get_attribute('src')
    browser.get(iframe_url)

    # Wait for the quick-login list, then click its first entry.
    quick_login_present = EC.presence_of_all_elements_located(
        (By.ID, 'qlogin_list')
    )
    WebDriverWait(browser, 10).until(quick_login_present)
    quick_login_entry = browser.find_element_by_css_selector('#qlogin_list a')
    quick_login_entry.click()
    print('登陆成功')
    return browser
def switch_spider(browser):
    """Click through the header navigation and switch to the second window.

    Waits for and clicks the fourth ``li`` of ``ul#headerNav``, then
    waits for and clicks the first ``color-tit`` element. Finally the
    driver focus moves to ``browser.window_handles[1]``.

    Args:
        browser: Browser object as returned by ``login_spider``.

    Returns:
        The same ``browser`` object.
    """
    nav_item_xpath = './/ul[@id="headerNav"]/li[4]'
    link_class = 'color-tit'

    # First click: the header navigation entry.
    nav_item_present = EC.presence_of_all_elements_located(
        (By.XPATH, nav_item_xpath)
    )
    WebDriverWait(browser, 10).until(nav_item_present)
    nav_item = browser.find_element_by_xpath(nav_item_xpath)
    nav_item.click()

    # Second click: the link that appears after the first click.
    link_present = EC.presence_of_all_elements_located(
        (By.CLASS_NAME, link_class)
    )
    WebDriverWait(browser, 10).until(link_present)
    link = browser.find_element_by_class_name(link_class)
    link.click()

    # Continue in the second window handle.
    second_window = browser.window_handles[1]
    browser.switch_to.window(second_window)
    return browser

def start_spider(browser, dir):
    """Visit every group in the first group list and export its members.

    For each ``li`` under ``div.my-all-group/ul[1]`` the entry is
    clicked, the ``#groupTit`` text is split into name and id, and
    ``extractor`` writes the members to ``<dir>/<name>_<id>``. The group
    chooser is then reopened via ``#changeGroup``.

    A failure on one group is reported on stdout and the loop moves on.

    Args:
        browser: Browser object already showing the group chooser.
        dir: Existing directory that receives one file per group.
    """
    group_items_xpath = './/div[@class="my-all-group"]/ul[1]/li'

    WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located(
            (By.CLASS_NAME, 'my-all-group')
        )
    )
    lis = browser.find_elements_by_xpath(group_items_xpath)
    group_num = len(lis)
    # Iterate by index: ``lis`` is re-fetched after every group because
    # the chooser is reopened and the old element handles are replaced.
    for idx in range(group_num):
        try:
            lis[idx].click()
            name_and_id = browser.find_element_by_id('groupTit').text
            # Split on the LAST '(' so a group name that itself contains
            # parentheses is kept whole.
            name, _, id_part = name_and_id.rpartition('(')
            group_id = id_part.split(')')[0]
            if not name:
                raise ValueError('unexpected group title: ' + repr(name_and_id))
            qq_list_path = dir + '/' + name + '_' + group_id
            print('开始提取[' + name_and_id + ']')
            member_num = extractor(browser, qq_list_path)
            print('提取[' + name_and_id + ']成功:' + str(member_num) + '人')
            browser.find_element_by_id('changeGroup').click()
            WebDriverWait(browser, 10).until(
                EC.presence_of_all_elements_located(
                    (By.CLASS_NAME, 'ui-dialog')
                )
            )
            lis = browser.find_elements_by_xpath(group_items_xpath)
        except Exception as e:
            # Best effort: keep going with the next group, but report the
            # failure instead of swallowing it silently.
            print('提取第' + str(idx + 1) + '个群失败:' + repr(e))
            continue

def combine(dir, file_name):
    """Merge all list files in ``dir`` into one de-duplicated file.

    Every regular file in ``dir`` except the output file itself is read
    line by line. The first whitespace-separated token of each non-blank
    line is written to ``<dir>/<file_name>`` the first time it is seen.
    A summary with the unique and duplicate counts is printed.

    Args:
        dir: Directory holding the per-group files; also receives the
            merged file.
        file_name: Name of the merged output file inside ``dir``.
    """
    dest_path = dir + '/' + file_name
    seen = set()  # O(1) membership test instead of scanning a list
    re_count = 0
    # Sorted so the merge order does not depend on the file system.
    entries = sorted(os.listdir(dir))
    with open(dest_path, 'w') as dest_file:
        for entry in entries:
            if entry == file_name:
                # The output lives in the same directory; reading it back
                # would count every number again as a duplicate.
                continue
            path = os.path.join(dir, entry)
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as src_file:
                for line in src_file:
                    tokens = line.split()
                    if not tokens:
                        # Blank line: nothing to take the first token from.
                        continue
                    qq = tokens[0]
                    if qq in seen:
                        re_count += 1
                    else:
                        seen.add(qq)
                        dest_file.write(qq + '\n')
    print('合并成功,共' + str(len(seen)) + '人,去除重复' + str(re_count))

if __name__ == '__main__':
    # Script entry point: log in, export every group's member list into
    # a dated directory, then merge the per-group files.
    url = 'https://qun.qq.com/'
    exe_path = 'C:/attachment/chromedriver.exe'
    # Output directory is named dataset_<yyyymmdd> using today's date.
    now = datetime.datetime.today().strftime("%Y%m%d")
    dir = 'dataset_' + now
    file_name = 'all_qq.txt'
    # An existing directory is fine; any other failure (e.g. permissions)
    # is raised instead of being hidden by a bare ``except``.
    os.makedirs(dir, exist_ok=True)
    browser = login_spider(exe_path, url)
    try:
        switch_spider(browser)
        start_spider(browser, dir)
    finally:
        # Always shut the browser down, even when scraping fails.
        browser.quit()
    combine(dir, file_name)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143

参考:
[1]Python爬虫使用selenium爬取qq群的成员信息(全自动实现自动登陆)[博客园]
[2] 教你用python爬取自己加入的QQ群成员名单 [知乎]

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/我家自动化/article/detail/339662
推荐阅读
相关标签
  

闽ICP备14008679号