
Using proxies and Selenium with Python


1. Building an IP proxy pool and using it with requests

import requests
from random import choice
from threading import Thread
from queue import Queue
from bs4 import BeautifulSoup
import csv
import threading


class NoProxiesError(Exception):
    pass


class NetThread(Thread):
    def __init__(self, url, type, queue):
        super().__init__()
        self.url = url
        self.type = type
        self.queue = queue

    def run(self):
        new_get_net_data(self.url, self.queue)


def update_proxies_pool():
    """通过蘑菇代理获取代理服务器地址和端口,构建IP代理池"""
    proxies_pool = []
    resp = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs'
                        '?appKey=4338998cd0824d9d9d75f8905bd687ba&count=5&'
                        'expiryDate=0&format=1&newLine=2')
    if resp.status_code == 200:
        result = resp.json()
        if result['code'] == '0':
            for item in result['msg']:
                ip, port = item['ip'], item['port']
                # proxies_pool.append(f'http://{ip}:{port}')
                proxies_pool.append({'http': f'{ip}:{port}'})
            return proxies_pool
    raise NoProxiesError('Failed to fetch proxy server information, please retry!')


# ==== old version: pick a proxy once, without retrying ====
def get_proxies():
    return choice(update_proxies_pool())


def new_get_proxies():
    while True:
        try:
            proxies = update_proxies_pool()
            return choice(proxies)
        except Exception:
            print('Failed to fetch a proxy IP, retrying...')
            continue


proxy = new_get_proxies()


def new_get_net_data(url, queue):
    global proxy
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # Send the request through the proxy
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'gbk'
        # print(response.text)
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('Request failed!')



# ================== Option 1: save every page of every category into a single file ==================
# proxy = get_proxies()
# Request the data of each page
def get_net_data(url, queue):
    global proxy
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # Send the request through the proxy
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'gbk'
        # print(response.text)
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('Request failed!')
    except NoProxiesError:
        print('Proxy error, switching to a new proxy!')
        proxy = get_proxies()
        get_net_data(url, queue)


# Parse the page data
def analysis_data(data, queue: Queue):
    currentThread = threading.current_thread()
    # print('type:', currentThread.type)
    doc = BeautifulSoup(data, 'lxml')
    ul = doc.select('.seeWell.cf>li')
    for li in ul:
        li_doc = BeautifulSoup(str(li), 'lxml')
        image_url = li_doc.img.attrs['src']
        name = li_doc.img.attrs['alt']
        au_name = li_doc.select('span.l>a:nth-child(2)')[0].get_text()
        # print([name, au_name, image_url])
        # Option 1: put a plain list on the queue
        # queue.put([name, au_name, image_url])
        # Option 2: tag each record with the category handled by the current thread
        queue.put({currentThread.type: [name, au_name, image_url]})


# Create one thread per page to fetch the data
def get_all_data():
    queue = Queue()
    t_list = []
    for type in range(1, 5):
        for page in range(1, 3):
            url = f'http://www.quanshuwang.com/list/{type}_{page}.html'
            # ===== Option 1 =====
            # t = Thread(target=get_net_data, args=(url, queue))
            # ===== Option 2 =====
            t = NetThread(url, type, queue)
            t.start()
            t_list.append(t)

    wait_t = Thread(target=new_write_data, args=(t_list, queue))
    wait_t.start()


# Wait for all worker threads to finish, then save the data
def write_data(t_list, queue: Queue):
    for t in t_list:
        t.join()
    queue.put('end')

    all_data = []
    while True:
        data = queue.get()
        if data == 'end':
            break
        else:
            all_data.append(data)

    with open('files/所有的小说数据.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['名字', '作者', '封面'])
        writer.writerows(all_data)
    print('Done!')


def new_write_data(t_list, queue: Queue):
    for t in t_list:
        t.join()
    queue.put('end')

    all_data = {
        1: [],
        2: [],
        3: [],
        4: []
    }

    while True:
        data = queue.get()
        if data == 'end':
            break

        key = list(data.keys())[0]
        all_data[key].append(data[key])

    for ty in all_data:
        with open(f'files/类型{ty}.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['名称', '作者', '图片'])
            writer.writerows(all_data[ty])


if __name__ == '__main__':
    get_all_data()

# html = """
# <li><a class="l mr10" href="http://www.quanshuwang.com/book_181326.html" target="_blank"><img alt="攻略极品" height="150" οnerrοr="this.src='/modules/article/images/nocover.jpg'" src="http://www.quanshuwang.com/modules/article/images/nocover.jpg" width="120"/></a><img class="topss png_bg" src="/kukuku/images/only2.png"/><span class="l"><a class="clearfix stitle" href="http://www.quanshuwang.com/book_181326.html" target="_blank" title="攻略极品">攻略极品</a>作者:<a href="/modules/article/authorarticle.php?author=%C8%F8%C1%D5%C4%C8">萨琳娜</a><em class="c999 clearfix">    斗极品?
#     不!
#     我们的口号是:走极品的路,让极品...<a href="http://www.quanshuwang.com/book_181326.html">更多</a></em><a class="readTo" href="http://www.quanshuwang.com/book_181326.html">马上阅读</a></span></li>
# """
# s = BeautifulSoup(html, 'lxml')
# print(s.img)
# print(s.select('span.l>a:nth-child(2)'))

# q = Queue()
# q.put(100)
# q.put(200)
# q.put('end')
#
# while True:
#     data = q.get()
#     print(data)
#     if data == 'end':
#         break
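
The proxies dict built in update_proxies_pool only covers the http scheme, so requests would send https traffic directly. Below is a minimal sketch of the mapping requests expects and of a quick outbound-IP check; the address 127.0.0.1:1080 is a placeholder and httpbin.org is just a public echo service, neither is part of the code above.

import requests

# Hypothetical proxy address, for illustration only.
proxy_address = '127.0.0.1:1080'

# requests takes one entry per URL scheme; a request whose scheme has no
# matching entry bypasses the proxy entirely.
proxies = {
    'http': f'http://{proxy_address}',
    'https': f'http://{proxy_address}',
}

try:
    # httpbin.org/ip echoes the caller's IP, so 'origin' should show the
    # proxy's address rather than the local machine's when the proxy works.
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(resp.json()['origin'])
except requests.RequestException:
    print('Proxy check failed')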


2. Using Selenium

import time

# 1. Basic usage
# from selenium import webdriver
#
# # Create a browser instance
# browser = webdriver.Chrome()
# # Open the target page
# browser.get('https://www.baidu.com')
# time.sleep(5)
# browser.close()

# 2. Configuring the browser
# from selenium import webdriver
# # Create an options object
# options = webdriver.ChromeOptions()
# # 1) Hide the "controlled by automated test software" flag
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# # 2) Disable image loading
# options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
# # Create the browser object with these options
# browser = webdriver.Chrome(options=options)
# browser.get('https://www.baidu.com')
# time.sleep(5)
# browser.close()

# 3. Basic operations
# from selenium import webdriver
# from selenium.webdriver.common import keys
#
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
#
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com')
# # Locate an element
# search_input = browser.find_element_by_id('kw')
# print(search_input)
# # Interact with the element
# search_input.send_keys('帅哥')
# search_input.send_keys(keys.Keys.ENTER)
# # Explicit wait until the element is present
# wait = WebDriverWait(browser, 10)
# wait.until(EC.presence_of_element_located((By.ID, 'head')))
# # Read information from the browser
# # print(browser.current_url)
# # print(browser.page_source)
# print(browser.get_cookies())
# time.sleep(20)
# browser.close()
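
# Compatibility note: the find_element_by_id / find_element_by_css_selector
# calls used in these examples are deprecated in Selenium 4 and removed in
# recent releases. A sketch of the equivalent By-based lookups (assuming
# Selenium 4 is installed):
# from selenium.webdriver.common.by import By
# search_input = browser.find_element(By.ID, 'kw')
# button = browser.find_element(By.CSS_SELECTOR, '#search > div > div.form > button')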

# 4. Simple interactions
# from selenium import webdriver
# browser = webdriver.Chrome()
# browser.get('https://www.jd.com')
# input = browser.find_element_by_id('key')
# button = browser.find_element_by_css_selector('#search > div > div.form > button')
# # Type into the input box
# input.send_keys('美食')
# # Click the button
# button.click()
# time.sleep(10)
# browser.close()

# 5. Action chains
# from selenium import webdriver
# from selenium.webdriver import ActionChains
#
# browser = webdriver.Chrome()
# url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
# browser.get(url)
# browser.switch_to.frame('iframeResult')
# source = browser.find_element_by_css_selector('#draggable')
# target = browser.find_element_by_css_selector('#droppable')
# # Create an action chain object
# actions = ActionChains(browser)
# # actions.drag_and_drop(source, target)
# actions.drag_and_drop_by_offset(source, 0, 200)
# actions.perform()    # Execute the queued actions
#
#
# time.sleep(25)
# browser.close()

# 6. Executing JavaScript
# from selenium import webdriver
#
# browser = webdriver.Chrome()
# browser.get('https://www.jd.com')
# body = browser.find_element_by_css_selector('body')
# print(body.size)
# time.sleep(1)
# browser.execute_script('window.scrollBy(0, 4474)')
# # browser.execute_script('alert("底部")')
# time.sleep(2)
# print(body.size)
# # time.sleep(10)
# browser.close()

# 7. Back and forward navigation
# import time
# from selenium import webdriver
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com/')
# browser.get('https://www.taobao.com/')
# browser.get('https://www.jd.com/')
# browser.back()
# time.sleep(1)
# browser.forward()
# browser.close()

# 8. Tabs
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
# browser.window_handles  -  get all tabs (window handles) currently open in the browser
print(browser.window_handles)
# Switch to the new tab
browser.switch_to.window(browser.window_handles[1])
browser.get('https://taobao.com')
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])


time.sleep(10)
browser.close()
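
The article's title pairs proxies with Selenium, but the examples above never set one. Below is a minimal sketch of routing Chrome through an HTTP proxy via ChromeOptions; the address 127.0.0.1:1080 is a placeholder, not a working proxy.

from selenium import webdriver

# Hypothetical proxy address, for illustration only.
proxy_address = '127.0.0.1:1080'

options = webdriver.ChromeOptions()
# --proxy-server is a Chrome command-line switch; all traffic from this
# browser session is sent through the given address.
options.add_argument(f'--proxy-server=http://{proxy_address}')

browser = webdriver.Chrome(options=options)
browser.get('http://httpbin.org/ip')  # the page should report the proxy's IP
print(browser.page_source)
browser.close()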