1. A multi-threaded crawler with an IP proxy pool
import requests
from random import choice
from threading import Thread
from queue import Queue
from bs4 import BeautifulSoup
import csv
import threading


class NoProxiesError(Exception):
    pass


class NetThread(Thread):
    """Worker thread that fetches one list page of one category."""

    def __init__(self, url, type, queue):
        super().__init__()
        self.url = url
        self.type = type
        self.queue = queue

    def run(self):
        new_get_net_data(self.url, self.queue)


def update_proxies_pool():
    """Fetch proxy addresses and ports from the Mogu proxy API and build an IP proxy pool."""
    proxies_pool = []
    resp = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs'
                        '?appKey=4338998cd0824d9d9d75f8905bd687ba&count=5&'
                        'expiryDate=0&format=1&newLine=2')
    if resp.status_code == 200:
        result = resp.json()
        if result['code'] == '0':
            for item in result['msg']:
                ip, port = item['ip'], item['port']
                # proxies_pool.append(f'http://{ip}:{port}')
                proxies_pool.append({'http': f'{ip}:{port}'})
            return proxies_pool
    raise NoProxiesError('Failed to fetch proxy server info, please retry!')


# ==== old version ====
def get_proxies():
    return choice(update_proxies_pool())


def new_get_proxies():
    """Keep requesting until a proxy pool is obtained, then pick one proxy at random."""
    while True:
        try:
            proxies = update_proxies_pool()
            return choice(proxies)
        except (NoProxiesError, requests.RequestException):
            print('Failed to get a proxy IP, retrying...')
            continue


proxy = new_get_proxies()


def new_get_net_data(url, queue):
    global proxy
    # request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # request the page through the proxy
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'gbk'
        # print(response.text)
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('Request failed!')


# ========== Plan 1: save every page of every category into a single file ==========
# proxy = get_proxies()

# fetch the data of one page
def get_net_data(url, queue):
    global proxy
    # request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    # request the page through the proxy
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'gbk'
        # print(response.text)
        analysis_data(response.text, queue)
    except requests.RequestException:
        print('Request failed!')
    except NoProxiesError:
        print('Proxy error!')
        proxy = get_proxies()
        get_net_data(url, queue)


# parse the page data
def analysis_data(data, queue: Queue):
    current_thread = threading.current_thread()
    # print('type:', current_thread.type)
    doc = BeautifulSoup(data, 'lxml')
    ul = doc.select('.seeWell.cf>li')
    for li in ul:
        li_doc = BeautifulSoup(str(li), 'lxml')
        image_url = li_doc.img.attrs['src']
        name = li_doc.img.attrs['alt']
        au_name = li_doc.select('span.l>a:nth-child(2)')[0].get_text()
        # print([name, au_name, image_url])
        # Plan 1:
        # queue.put([name, au_name, image_url])
        # Plan 2: tag each record with the category of the thread that produced it
        queue.put({current_thread.type: [name, au_name, image_url]})


# create thread objects to fetch every page
def get_all_data():
    queue = Queue()
    t_list = []
    for type in range(1, 5):
        for page in range(1, 3):
            url = f'http://www.quanshuwang.com/list/{type}_{page}.html'
            # ===== Plan 1 =====
            # t = Thread(target=get_net_data, args=(url, queue))
            # ===== Plan 2 =====
            t = NetThread(url, type, queue)
            t.start()
            t_list.append(t)
    wait_t = Thread(target=new_write_data, args=(t_list, queue))
    wait_t.start()


# wait for all threads to finish, then save the data
def write_data(t_list, queue: Queue):
    for t in t_list:
        t.join()
    queue.put('end')

    all_data = []
    while True:
        data = queue.get()
        if data == 'end':
            break
        all_data.append(data)

    with open('files/all_novel_data.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Name', 'Author', 'Cover'])
        writer.writerows(all_data)
    print('Done!')


def new_write_data(t_list, queue: Queue):
    for t in t_list:
        t.join()
    queue.put('end')

    all_data = {1: [], 2: [], 3: [], 4: []}
    while True:
        data = queue.get()
        if data == 'end':
            break
        key = list(data.keys())[0]
        all_data[key].append(data[key])

    # write one CSV file per category
    for ty in all_data:
        with open(f'files/type_{ty}.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Name', 'Author', 'Image'])
            writer.writerows(all_data[ty])


if __name__ == '__main__':
    get_all_data()

# html = """
# <li><a class="l mr10" href="http://www.quanshuwang.com/book_181326.html" target="_blank"><img alt="攻略极品" height="150" onerror="this.src='/modules/article/images/nocover.jpg'" src="http://www.quanshuwang.com/modules/article/images/nocover.jpg" width="120"/></a><img class="topss png_bg" src="/kukuku/images/only2.png"/><span class="l"><a class="clearfix stitle" href="http://www.quanshuwang.com/book_181326.html" target="_blank" title="攻略极品">攻略极品</a>作者:<a href="/modules/article/authorarticle.php?author=%C8%F8%C1%D5%C4%C8">萨琳娜</a><em class="c999 clearfix">斗极品?不!我们的口号是:走极品的路,让极品...<a href="http://www.quanshuwang.com/book_181326.html">更多</a></em><a class="readTo" href="http://www.quanshuwang.com/book_181326.html">马上阅读</a></span></li>
# """
# s = BeautifulSoup(html, 'lxml')
# print(s.img)
# print(s.select('span.l>a:nth-child(2)'))

# q = Queue()
# q.put(100)
# q.put(200)
# q.put('end')
#
# while True:
#     data = q.get()
#     print(data)
#     if data == 'end':
#         break
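The script above coordinates its threads with a sentinel value: each NetThread puts parsed records into a shared Queue, while a dedicated writer thread joins all producers, pushes the string 'end', and then drains the queue until it reads that sentinel back. Below is a minimal, self-contained sketch of just that pattern; the names worker, collector and SENTINEL are illustrative and not part of the crawler above.

from threading import Thread
from queue import Queue

SENTINEL = 'end'  # the crawler above uses the same string as its end marker

def worker(n, queue):
    # stand-in for new_get_net_data: produce one record per "page"
    queue.put([f'title-{n}', f'author-{n}', f'cover-{n}.jpg'])

def collector(workers, queue):
    # mirror of new_write_data: wait for every producer, then drain the queue
    for t in workers:
        t.join()
    queue.put(SENTINEL)  # safe to enqueue now: all producers have finished
    while True:
        record = queue.get()
        if record == SENTINEL:
            break
        print('collected:', record)

queue = Queue()
workers = [Thread(target=worker, args=(n, queue)) for n in range(4)]
for t in workers:
    t.start()
Thread(target=collector, args=(workers, queue)).start()

Enqueuing the sentinel only after every worker has been joined guarantees it lands behind all real records, so the collector never exits early.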
2. Using Selenium
import time

# 1. Basic usage
# from selenium import webdriver
#
# # create the browser
# browser = webdriver.Chrome()
# # open the target page
# browser.get('https://www.baidu.com')
# time.sleep(5)
# browser.close()

# 2. Configuring the browser
# from selenium import webdriver
# # create the options object
# options = webdriver.ChromeOptions()
# # 1) hide the "automated test software" identity
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# # 2) disable image loading
# options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
# # create the browser object
# browser = webdriver.Chrome(options=options)
# browser.get('https://www.baidu.com')
# time.sleep(5)
# browser.close()

# 3. Basic operations
# from selenium import webdriver
# from selenium.webdriver.common import keys
#
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
#
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com')
# # get an element
# search_input = browser.find_element_by_id('kw')
# print(search_input)
# # interact with the element
# search_input.send_keys('handsome')
# search_input.send_keys(keys.Keys.ENTER)
# # explicit wait
# wait = WebDriverWait(browser, 10)
# wait.until(EC.presence_of_element_located((By.ID, 'head')))
# # read page information
# # print(browser.current_url)
# # print(browser.page_source)
# print(browser.get_cookies())
# time.sleep(20)
# browser.close()

# 4. Simple interactions
# from selenium import webdriver
# browser = webdriver.Chrome()
# browser.get('https://www.jd.com')
# input = browser.find_element_by_id('key')
# button = browser.find_element_by_css_selector('#search > div > div.form > button')
# # type into the search box
# input.send_keys('food')
# # click the button
# button.click()
# time.sleep(10)
# browser.close()

# 5. Action chains
# from selenium import webdriver
# from selenium.webdriver import ActionChains
#
# browser = webdriver.Chrome()
# url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
# browser.get(url)
# browser.switch_to.frame('iframeResult')
# source = browser.find_element_by_css_selector('#draggable')
# target = browser.find_element_by_css_selector('#droppable')
# # create the action chain object
# actions = ActionChains(browser)
# # actions.drag_and_drop(source, target)
# actions.drag_and_drop_by_offset(source, 0, 200)
# actions.perform()  # run the queued actions
#
# time.sleep(25)
# browser.close()

# 6. Executing JavaScript code
# from selenium import webdriver
#
# browser = webdriver.Chrome()
# browser.get('https://www.jd.com')
# body = browser.find_element_by_css_selector('body')
# print(body.size)
# time.sleep(1)
# browser.execute_script('window.scrollBy(0, 4474)')
# # browser.execute_script('alert("bottom")')
# time.sleep(2)
# print(body.size)
# time.sleep(10)
# browser.close()

# 7. Back and forward
# import time
# from selenium import webdriver
# browser = webdriver.Chrome()
# browser.get('https://www.baidu.com/')
# browser.get('https://www.taobao.com/')
# browser.get('https://www.jd.com/')
# browser.back()
# time.sleep(1)
# browser.forward()
# browser.close()

# 8. Tabs
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
# browser.window_handles - all tabs currently open in the browser
print(browser.window_handles)
# switch to another tab
browser.switch_to.window(browser.window_handles[1])
browser.get('https://taobao.com')
time.sleep(1)
browser.switch_to.window(browser.window_handles[0])
time.sleep(10)
browser.close()
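Demo 8 above paces the tab switches with fixed time.sleep calls. As a rough sketch (not from the original), the same flow can reuse the explicit waits shown in demo 3, so the script proceeds as soon as the browser is actually ready; EC.number_of_windows_to_be and EC.url_contains are standard expected_conditions helpers, and the 10-second timeout is an arbitrary choice.

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')

# wait until the second tab actually exists instead of sleeping
WebDriverWait(browser, 10).until(EC.number_of_windows_to_be(2))

browser.switch_to.window(browser.window_handles[1])
browser.get('https://taobao.com')

# wait until the new tab has really navigated before switching back
WebDriverWait(browser, 10).until(EC.url_contains('taobao'))

browser.switch_to.window(browser.window_handles[0])
browser.close()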