赞
踩
第一章:selenium实现爬虫功能selenium爬取图片实例
第二章:selenium实现增量式爬虫功能增量式爬虫
第三章:selenium搜索关键字爬虫
自上次爬取完了4K美女的图片之后,发现动漫栏里边也有很多好看的美女壁纸,但是如果直接爬取整个动漫栏则会出现很多多余的图片,所以这次采用了搜索关键字进行爬取。
声明一下,由于chrome的高级版本宣布不支持selenium,所以博主将浏览器换成了Firefox。
# Crawler v1: walk the 4kmeinv listing on pic.netbian.com page by page,
# download every wallpaper and record it in the MySQL table `suoyin`.
import requests
from selenium import webdriver
import os
import pymysql


def hide():
    """Return a headless Firefox webdriver."""
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver


def Gethtml(url):
    """Scrape one listing page, then follow the next-page link until the last page."""
    driver = hide()
    try:
        driver.get(url)
        s = driver.find_elements_by_css_selector("div[class='slist'] li a")
        # While more pages remain, the last anchor is the "next page" link
        # (its href still sits under /4kmeinv/); on the final page every
        # anchor is a picture link, so crawl them all and stop.
        if str(s[-1].get_attribute("href")).split("/")[-2] == "4kmeinv":
            geturl(s[:-1])
            next_url = str(s[-1].get_attribute("href"))
        else:
            geturl(s)
            next_url = None  # BUG FIX: the original recursed into a picture page here
        if next_url:
            print(next_url)
    finally:
        driver.quit()  # BUG FIX: drivers were never closed — one leaked browser per page
    if next_url:
        Gethtml(next_url)


def huoqvpicture(url):
    """Open one picture detail page, index it in MySQL and download the image."""
    # BUG FIX: the original did webdriver.Chrome(options=hide()) — but hide()
    # returns a *driver*, not an options object, so this call always failed.
    driver = hide()
    try:
        driver.get(url)
        s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        print(s.get_attribute("title"))
        insert(url, s.get_attribute("src"), s.get_attribute("title"))
        GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    finally:
        driver.quit()


def GetPicture(url, name):
    """Save the image at `url` as ../dist/<name>.jpg, skipping files that exist."""
    root = "../dist/"
    path = root + name.replace(" ", "") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:
                f.write(r.content)
            print("文件保存成功")
        else:
            print("文件已存在")
    except Exception:  # narrowed from a bare except so Ctrl-C still interrupts
        print("爬取失败")


def geturl(s):
    """Download every linked picture in `s` that is not already indexed."""
    for i in s:
        print(i.get_attribute("href"))
        if not qvchong(i.get_attribute("href")):
            huoqvpicture(str(i.get_attribute("href")))


def insert(html, jpg, name):
    """Insert one (page-url, image-url, title) row into `suoyin`."""
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        # SECURITY FIX: parameterized query instead of string concatenation —
        # titles routinely contain quotes, which both broke and injected SQL.
        sql = "insert into suoyin(html,jpg,name) values(%s,%s,%s);"
        print(sql)  # BUG FIX: original printed the literal string "sql"
        # BUG FIX: cur.execute was commented out, so commit() committed nothing.
        cur.execute(sql, (str(html), str(jpg), str(name)))
        con.commit()
    finally:
        con.close()  # ROBUSTNESS: connections were never closed


def qvchong(i):
    """Return True if page-url `i` is already indexed (duplicate check)."""
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        # PERF: let the database look up one row instead of fetching the
        # whole table on every check.
        cur.execute("select html from suoyin where html=%s", (str(i),))
        if cur.fetchone():
            print("数据已存在")
            return True
        return False
    finally:
        con.close()


def main():
    url = "https://pic.netbian.com/4kmeinv/index.html"
    Gethtml(url)


if __name__ == "__main__":  # IDIOM: don't start crawling on import
    main()
# Crawler v2: search pic.netbian.com for a keyword, then walk the result
# pages, downloading and indexing every wallpaper found.
import requests
from selenium import webdriver
import os
import pymysql
import time


def hide():
    """Return a headless Firefox webdriver."""
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver


def huoqvpicture(url):
    """Open one picture detail page, index it in MySQL and download the image."""
    driver = hide()
    try:
        driver.get(url)
        s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        print(s.get_attribute("title"))
        insert(url, s.get_attribute("src"), s.get_attribute("title"))
        GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    finally:
        driver.close()  # ROBUSTNESS: close the browser even when scraping raises


def GetPicture(url, name):
    """Save the image at `url` as ../dist/<name>.jpg, skipping files that exist."""
    root = "../dist/"
    path = root + name.replace(" ", "") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:
                f.write(r.content)
            print("文件保存成功")
        else:
            print("文件已存在")
    except Exception:  # narrowed from a bare except so Ctrl-C still interrupts
        print("爬取失败")


def geturl(s):
    """Download every linked picture in `s` that is not already indexed."""
    for i in s:
        print(i.get_attribute("href"))
        if not qvchong(i.get_attribute("href")):
            huoqvpicture(str(i.get_attribute("href")))


def insert(html, jpg, name):
    """Insert one (page-url, image-url, title) row into `suoyin`."""
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        html = str(html)
        jpg = str(jpg)
        name = qvdian(str(name))  # kept so stored titles match the old rows
        # SECURITY FIX: parameterized query instead of string concatenation.
        sql = "insert into suoyin(html,jpg,name) values(%s,%s,%s);"
        print("插入一条数据")
        cur.execute(sql, (html, jpg, name))
        con.commit()
    finally:
        con.close()  # ROBUSTNESS: connections were never closed


def qvchong(i):
    """Return True if page-url `i` is already indexed (duplicate check)."""
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        # PERF: let the database look up one row instead of fetching the
        # whole table on every check.
        cur.execute("select html from suoyin where html=%s", (str(i),))
        if cur.fetchone():
            print("数据已存在")
            return True
        return False
    finally:
        con.close()


def geisuo(driver, a):
    """Crawl every result page starting at page `a`, clicking "next page"."""
    # Converted from recursion to a loop: one recursive call per page would
    # hit Python's recursion limit on deep result sets.
    while True:
        s = driver.find_elements_by_css_selector("div[class='slist'] li a")
        print(a)  # progress: current page number
        a = a + 1
        # While more pages remain, the last anchor is the next-page link
        # (href under .../result/), so exclude it from scraping.
        if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
            geturl(s[:-1])
        else:
            geturl(s)
            # BUG FIX: the last page has no "nextpage" button; the original
            # still tried to click it and crashed with NoSuchElementException.
            return
        bt = driver.find_element_by_class_name("nextpage")
        bt.click()


def click(url):
    """Open the search page, type the keyword and start crawling the results."""
    driver = hide()
    driver.implicitly_wait(3)  # give elements up to 3 s to appear
    driver.get(url)
    keyboard = driver.find_element_by_name("keyboard")
    time.sleep(1)  # brief pause to look human
    keyboard.send_keys("美女")
    bt = driver.find_element_by_name("submit")
    time.sleep(1)
    bt.click()
    geisuo(driver, 1)


def qvdian(s):
    """Return `s` as a string with all single quotes removed."""
    s = str(s)
    ls = s.split("'")
    s = "".join(ls)
    return s


def main():
    url = "https://pic.netbian.com/e/search/result/index.php?page=1&searchid=16"
    click(url)


if __name__ == "__main__":  # IDIOM: don't start crawling on import
    main()
def hide():
    """Build and return a Firefox driver running in headless mode."""
    opts = webdriver.FirefoxOptions()
    opts.add_argument('-headless')
    return webdriver.Firefox(options=opts)
更换浏览器后,对无头设置进行了简单的修改,现在是直接返回一个浏览器,提高了代码运行效率。
def click(url):
    """Search the site for the keyword, then hand the result pages to geisuo()."""
    driver = hide()              # headless browser
    driver.implicitly_wait(3)    # let elements take up to 3 s to appear
    driver.get(url)              # load the search page
    search_box = driver.find_element_by_name("keyboard")
    time.sleep(1)                # brief pause to look human
    search_box.send_keys("美女")  # type the search keyword
    submit = driver.find_element_by_name("submit")
    time.sleep(1)                # brief pause to look human
    submit.click()               # run the search
    geisuo(driver, 1)            # crawl the results starting from page 1
这段代码可以说是整个新增代码中的核心代码,他负责进行关键字搜索。
def geisuo(driver, a):
    """Scrape every result page, clicking "next page" until the last one.

    `a` is the current page number, printed so a human can track progress.
    """
    # Converted from recursion to a loop: one recursive call per page would
    # hit Python's recursion limit on deep result sets.
    while True:
        s = driver.find_elements_by_css_selector("div[class='slist'] li a")
        print(a)   # current page number
        a = a + 1
        # While more pages remain, the last anchor is the next-page link
        # (href under .../result/), so exclude it from scraping.
        if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
            geturl(s[:-1])
        else:
            geturl(s)
            # BUG FIX: the last page has no "nextpage" button; the original
            # still tried to click it and crashed with NoSuchElementException.
            return
        bt = driver.find_element_by_class_name("nextpage")
        bt.click()  # advance to the next result page
之前翻页都是靠重新获取url,此次采用了鼠标点击。
def huoqvpicture(url):
    """Open one picture detail page, index it in MySQL and download the image."""
    driver = hide()
    try:
        driver.get(url)
        s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        print(s.get_attribute("title"))
        insert(url, s.get_attribute("src"), s.get_attribute("title"))
        GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    finally:
        # ROBUSTNESS: the original only closed the browser on success; any
        # scraping error leaked a headless Firefox process.
        driver.close()
建议大家在所有代码写完之后再开启无头模式,博主之前每爬取一张图片都会开一个浏览器也不关,开着无头模式自己也察觉不到,这样很耗内存。
def qvdian(s):
    """Return `s` coerced to str with every single-quote character removed."""
    return str(s).replace("'", "")
在数据插入数据库时,如果里面有 ‘ 会导致插入失败,写一个小函数去掉这个单引号,修复一个bug。
selenium的大部分知识就结束了,博主开始学习爬虫框架scrapy,学完之后基本就学完了python所有的爬虫内容,selenium文章结束。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。