What is a web crawler?
- Every site publishes a crawler policy (robots.txt), e.g. https://www.baidu.com/robots.txt, which spells out what is and is not allowed to be crawled.
- Technically, anything you can see in a browser can be crawled.
- Legally, much of it is a gray area, so check the rules before you scrape.
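As a quick illustration (my own addition, not from the original notes), the standard library can read a site's robots.txt before you start crawling:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser('https://www.baidu.com/robots.txt')
rp.read()
# can_fetch(user_agent, url): may this user agent crawl this URL?
print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python'))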
1. The requests module (a module for simulating HTTP requests)
- Install: pip3 install requests. Python ships with urllib (and urllib2 in Python 2); requests is a friendlier module built on top of them.

#**** Basic usage ****
# import the module
import requests

# send a GET request; a response object comes back
res = requests.get('https://www.baidu.com')

# the body of the response as text
print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

# the status code of the response
print(res.status_code)

Introduction to the requests module
#**** Carrying query parameters: Chinese text must be URL-encoded ****
import requests
from urllib.parse import urlencode

key = input('Enter what you want to search for: ')
# parameters containing Chinese or other special characters have to be encoded
key_search = urlencode({'wd': key})
# print(key_search)

url = 'https://www.baidu.com/s?%s' % key_search

# one anti-crawling countermeasure: send a User-Agent header
res = requests.get(url,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   })

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

Basic usage of requests - 1
# Encoding by hand every time is tedious; requests can do it through the params argument
import requests

key = input('Enter what you want to search for: ')

# one anti-crawling countermeasure: send a User-Agent header
res = requests.get('https://www.baidu.com/s',
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   },
                   # parameters passed in GET form
                   params={'wd': key, 'pn': 70})

print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

# cookies
import requests

cookies = {
    'user_session': '2OjTrCoCXigz5gB7trCsYfMBl0jQ-abqjXdCcas9UqwVmD7y',
}

# GitHub is not picky about request headers, so no custom User-Agent is needed here;
# other sites may require one
response = requests.get('https://github.com/settings/emails', cookies=cookies)

print('lich_qiu@163.com' in response.text)  # True

Basic usage of requests - 2
- GET parameters
  - params = dict: the parameters to send in the query string
  - headers = dict:
    - User-Agent: the client type
    - Referer: the previous address the request came from (used, for example, for image hotlink protection)
    - Host
    - Cookie: normally a string inside the request headers; because cookies are needed so often, requests handles them separately through the cookies argument, e.g. cookies={'user_session': 'xxx'}
- POST parameters
  - params
  - headers
  - cookies
  - data: the request body, urlencoded by default
  - json: pass a dict and the request is sent with 'content-type': 'application/json'
  - allow_redirects=False: whether redirects are followed; the default is True and is rarely changed
  (a minimal sketch of json= and allow_redirects follows the GitHub login example below)

# Step 1: send a GET request to https://github.com/login
import requests
import re
res_login = requests.get('https://github.com/login',
                         headers={
                             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                         })
# print(res_login.text)

# the returned page contains an authenticity_token; pull it out
# re.S treats the whole string as one line
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', res_login.text, re.S)[0]
print(authenticity_token)

# grab the cookie of the not-yet-logged-in session
login_cookie = res_login.cookies.get_dict()
print(login_cookie)

# Step 2: send a POST request to https://github.com/session carrying the username and password
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lich_qiu@163.com',
    'password': 'zhang319!',
    'webauthn-support': 'supported'
}

res = requests.post(url='https://github.com/session',
                    # the request body
                    data=data,
                    # carry the cookie of the not-yet-authenticated session
                    cookies=login_cookie)

# after a successful login the server returns a cookie; keep it and send it with later requests
# res.cookies.get_dict() converts the returned cookies into a dict
res_cookie = res.cookies.get_dict()
print(res_cookie)

# Step 3: request https://github.com/settings/emails and check whether lich_qiu@163.com is in the response
response = requests.get('https://github.com/settings/emails',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                            'Referer': 'https://github.com/settings/profile'
                        },
                        # GitHub is not picky about request headers; other sites may require more
                        cookies=res_cookie)

print('lich_qiu@163.com' in response.text)  # True

Introduction to GET and POST, with simple examples
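As a follow-up to the parameter list above, here is a minimal sketch of json= and allow_redirects (my own illustration against httpbin.org, not part of the original notes):

import requests

# json= serializes the dict for you and sets the Content-Type header to application/json
r = requests.post('http://httpbin.org/post', json={'name': 'lich', 'age': 18})
print(r.json()['headers']['Content-Type'])   # application/json

# allow_redirects=False: do not follow the 302, inspect it instead
r = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(r.status_code)            # 302
print(r.headers['Location'])    # where the server wanted to redirect us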
# Encoding issues
import requests
response = requests.get('http://www.autohome.com/news')

# the encoding the page itself appears to use
print(response.apparent_encoding)

# switch the decoding to gbk
response.encoding = 'gbk'
print(response.text)

Encoding issues
# Scraping an image
# For a small file you can fetch the whole content in one go

import requests
res = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1556732811646&di=2bd8396b35047f33fbcd6b023f5787b7&imgtype=0&src=http%3A%2F%2Fs15.sinaimg.cn%2Fmw690%2F0066UWNtgy6Viz3mEBoce%26690')

with open('a.jpg', 'wb') as f:
    f.write(res.content)

Scraping images
# Scraping a video
# For a large file, use iter_content() to stream the body chunk by chunk

import requests

res = requests.get('http://static.yximgs.com/s1/videos/www_main-059ce9beee.mp4')
with open('a.mp4', 'wb') as f:
    for chunk in res.iter_content():
        f.write(chunk)

Scraping videos
# Parsing JSON
import requests

response = requests.get('http://httpbin.org/get')

import json
res1 = json.loads(response.text)  # too much work

res2 = response.json()  # get the JSON data directly

print(res1 == res2)  # True, the two results are identical

Parsing JSON
- The response object (a runnable sketch follows below)
  print(response.text)                # the body as text
  print(response.content)             # the body as bytes
  print(response.status_code)         # the status code
  print(response.headers)             # the response headers
  print(response.cookies)             # the cookies the server returned
  print(response.cookies.get_dict())  # the returned cookies as a dict
  print(response.cookies.items())     # like dict.items()
  print(response.url)                 # the final URL, after any redirects
  print(response.history)             # the list of intermediate redirect responses
  print(response.encoding)            # the encoding used to decode the body

Working with the response object
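A minimal runnable sketch of these attributes (my own illustration against httpbin.org; the /cookies/set endpoint replies with a 302 redirect that also sets a cookie):

import requests

response = requests.get('http://httpbin.org/cookies/set?name=lich')

print(response.status_code)                    # 200, after the redirect was followed
print(response.url)                            # http://httpbin.org/cookies, the final URL
print(response.history)                        # [<Response [302]>], the intermediate hop
print(response.history[0].cookies.get_dict())  # {'name': 'lich'}, set by the redirecting response
print(response.headers['Content-Type'])
print(response.encoding)                       # how response.text will decode the body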
2. Advanced usage of the requests module
# 1 SSL cert verification

# verify=False skips certificate verification (a sketch of that follows below)
import requests

# carry a client certificate
response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/key'))

1 SSL cert verification
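A minimal sketch of the verify=False variant mentioned above (my own addition): skipping verification makes urllib3 emit an InsecureRequestWarning, which can be silenced explicitly.

import requests
import urllib3

# silence the warning that verify=False triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)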
# 2 Using proxies

# HTTP proxies
import requests

proxies = {
    # a proxy with credentials; the part before the @ is user:password
    'http': 'http://lich:123@112.85.151.216:9999',
    # 'http': '223.241.116.173:8010',   # a plain proxy without credentials
    'https': 'https://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

2 Using proxies
# SOCKS proxies (needs the socks extra: pip3 install requests[socks])
import requests

proxies = {
    # a proxy with credentials; the part before the @ is user:password
    'http': 'socks5://lich:123@112.85.151.216:9999',
    # 'http': 'socks5://223.241.116.173:8010',
    # 'https': 'socks5://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)

print(response.status_code)

3 SOCKS proxies
# Timeout settings
import requests
response = requests.get('https://www.12306.cn', timeout=0.0001)

4 Timeout settings
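A timeout that small will almost certainly fail; a minimal sketch of catching the resulting exception (my own addition):

import requests

try:
    response = requests.get('https://www.12306.cn', timeout=0.0001)
except requests.exceptions.Timeout as e:
    print('the request timed out:', e)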
# Uploading files
import requests
files = {
    'file': open('a.jpg', 'rb')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)

5 Uploading files
3. Crawler project examples
# Single-threaded crawler
import requests
import re
import os


# generic helper: fetch the HTML of a page by URL
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # pull the relative links of the video pages out of the listing page
    urls = re.findall(r'class="categoryem".*?href="(.*?)"', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        yield 'https://www.pearvideo.com/' + url


def parse_detail(text):
    # print(text)
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    return movie_url


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    with open('%s/download/%s' % (base_dir(), file_name), 'wb') as f:
        f.write(movie_content.content)


if __name__ == '__main__':
    res = get_page('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=1')
    # res is the HTML of the listing page
    urls = parse_res(res)
    for url in urls:
        try:
            res_detail = get_page(url)
            movie_url = parse_detail(res_detail)
            download_movie(movie_url)
        except Exception as e:
            print(e)

Single-threaded Pearvideo crawler
# Multi-threaded crawler
import requests
import re
import os
from concurrent.futures import ThreadPoolExecutor

# create a pool of 60 worker threads
pool = ThreadPoolExecutor(60)


# generic helper: fetch the HTML of a page by URL
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # text is a Future here; .result() gives the return value of get_page
    text = text.result()
    urls = re.findall(r'class="categoryem".*?href="(.*?)"', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        # yield 'https://www.pearvideo.com/' + url
        pool.submit(get_page, 'https://www.pearvideo.com/' + url).add_done_callback(parse_detail)


def parse_detail(text):
    # print(text)
    text = text.result()
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    pool.submit(download_movie, movie_url)


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    base = os.path.join(base, 'download')
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    file = os.path.join(base_dir(), file_name)
    if movie_content.status_code == 200:
        with open(file, 'wb') as f:
            f.write(movie_content.content)


if __name__ == '__main__':
    for i in range(3):
        url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=%s' % (i * 12 + 1)
        # submit the page fetch to the thread pool;
        # add_done_callback() runs the given function once the submitted task finishes
        pool.submit(get_page, url).add_done_callback(parse_res)

Multi-threaded Pearvideo crawler
# import the BeautifulSoup module
import requests
from bs4 import BeautifulSoup
import time
import os

# https://www.autohome.com.cn/news/2/#liststart
for i in range(1, 10):
    url = 'https://www.autohome.com.cn/news/%s/#liststart' % i
    ret = requests.get(url)
    # print(ret.text)

    # soup = BeautifulSoup(ret.text, 'lxml')
    soup = BeautifulSoup(ret.text, 'html.parser')
    ul = soup.find(name='ul', attrs={'class': 'article'})
    li_list = ul.find_all(name='li')
    for li in li_list:
        try:
            # the URL of the news item
            news_url = 'https:' + li.find(name='a').get('href')    # read an attribute
            news_title = li.find(name='h3').text                   # text of the h3 tag
            news_desc = li.find(name='p').text                     # the news summary
            news_img = 'https:' + li.find(name='img').get('src')   # the news image
            print('''News title: %s
News summary: %s
News URL: %s
News image URL: %s''' % (news_title, news_desc, news_url, news_img))

            # download the news image
            response = requests.get(news_img)
            time_name = str(time.time()) + '.jpg'
            base_path = os.path.dirname(os.path.abspath(__file__))
            download_path = os.path.join(base_path, 'download')
            file_name = os.path.join(download_path, time_name)
            with open(file_name, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(e)

Scraping Autohome news with BeautifulSoup
import requests

ret = requests.get('https://dig.chouti.com',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   })

print(ret.status_code)
print(ret.text)

# Simulated login: code 9999 in the response means the login succeeded,
# but upvoting with only this login cookie still fails
ret = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                    },
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
# print(ret.text)
# grab the cookie returned after logging in
cookie = ret.cookies.get_dict()

# upvote an article: send a POST request to https://dig.chouti.com/link/vote?linksId=25944651
res = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=cookie)
print(res.text)

Chouti auto-upvote: analysis
# Step 1: open the Chouti home page
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   })
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate the login
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(res.text)
res_cookie = res.cookies.get_dict()

# Step 3 (hard-coded version, superseded by the loop below):
# upvote one article by POSTing to https://dig.chouti.com/link/vote?linksId=25944651
# response = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
#                          headers={
#                              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
#                              'referer': 'https://dig.chouti.com/'
#                          },
#                          cookies=ret_cookie)
# print(response.text)

# Step 3 (after a successful login): walk the listing pages and collect the vote URLs
post_url_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            })
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            # the news id sits in the img tag's lang attribute
            news_id = div.find(name='img').get('lang')
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)
# print(post_url_list)

# Step 4: loop over the collected URLs and upvote each article
# upvoting = POST to https://dig.chouti.com/link/vote?linksId=<news_id>
for url in post_url_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie)
    print(up_news.text)

Chouti auto-upvote: implementation
# Step 1: open the Chouti home page
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   })
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate the login
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    })
print(res.text)
res_cookie = res.cookies.get_dict()

# Step 3 (after a successful login): walk the listing pages,
# collect the news ids and build the vote URLs
post_url_list = []
news_id_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            })
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            news_id_list.append(news_id)
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)

# Step 4: loop over the articles and cancel the upvotes
# cancelling = POST to https://dig.chouti.com/vote/cancel/vote.do with form data {'linksId': <news_id>}
url = 'https://dig.chouti.com/vote/cancel/vote.do'
for news_id in news_id_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie,
                            data={'linksId': news_id})
    print(up_news.text)

Chouti auto-cancel-upvote: implementation
4. The bs4 (BeautifulSoup) module
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# 1 usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# soup = BeautifulSoup(open('a.html'), 'lxml')

print(soup.p)                 # if several identical tags exist, only the first is returned
print(soup.p.b.text)          # The Dormouse's story
print(soup.p.b.get('class'))  # ['boldest']
print(soup.a)                 # if several identical tags exist, only the first is returned

# 2 name of a tag
print(soup.p.name)

# 3 attributes of a tag
print(soup.p.attrs)

# 4 contents of a tag
print(soup.p.string)   # the text when p has exactly one text child, otherwise None
print(soup.p.strings)  # a generator over all text inside p
print(soup.p.text)     # all text inside p
for line in soup.stripped_strings:  # same, with whitespace stripped
    print(line)

'''
If a tag has several children, .string cannot decide which child's text to return,
so it yields None; with a single child it returns that child's text.
For a structure like <p>哈哈哈哈<b>...</b></p>, soup.p.string is None,
but soup.p.strings still finds all the text.
'''

# 5 nested selection
print(soup.head.title.string)
print(soup.body.a.string)

# 6 children and descendants
print(soup.p.contents)   # all direct children of p
print(soup.p.children)   # an iterator over all direct children of p
for i, child in enumerate(soup.p.children):
    print(i, child)

print(soup.p.descendants)  # all descendants: every tag nested anywhere under p
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# 7 parents and ancestors
print(soup.a.parent)   # the parent of the a tag
print(soup.a.parents)  # all ancestors of the a tag: parent, grandparent, ...

# 8 siblings
print('=====>')
print(soup.a.next_sibling)      # the next sibling
print(soup.a.previous_sibling)  # the previous sibling

print(list(soup.a.next_siblings))  # all following siblings => generator
print(soup.a.previous_siblings)    # all preceding siblings => generator

1 Traversing the document tree
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Five kinds of filters: string, regular expression, list, True, function
# (the True and function filters are shown in the sketch after this block)
# soup.find()
# find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
# name: tag name; attrs: attributes; text: text content;
# recursive=False means do not search recursively (the default is True); plus **kwargs

from bs4 import BeautifulSoup

# 1 exact string match
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name='body')                  # condition 1
# ret = soup.find(attrs={'class': 'title'})     # condition 2
ret = soup.find(text="The Dormouse's story")    # condition 3
print(ret)
print(type(ret))

# 2 regular-expression match
import re
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name=re.compile('^p'))
# ret = soup.find(attrs={'class': re.compile('^s')})
ret = soup.find(name='a', text=re.compile('^L'))
print(ret)

# 3 list match
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find_all(name=['a', 'b'])
# ret = soup.find_all(attrs={'class': ['title', 'sister']})
ret = soup.find_all(text=['Elsie', 'Lacie'])
print(ret)

2 Searching the document tree
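The notes list five filters but only show the first three; here is a minimal sketch of the remaining two, True and a function (my own addition, assuming the html_doc string defined above is in scope):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# 4 True: matches every tag (but no text nodes)
for tag in soup.find_all(True):
    print(tag.name)

# 5 function: receives a tag and returns True for the tags you want
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

print(soup.find_all(has_class_but_no_id))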
5. The selenium module
# The most basic usage
from selenium import webdriver
import time


# webdriver.Chrome() returns an object that drives a real browser
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
print(browser.page_source)

time.sleep(2)

# always close the browser when you are done
browser.close()

Basic usage
#### All the selector methods (a short sketch using a few of them follows below)
# 1. find_element_by_id                 find by id
# 2. find_element_by_link_text          find a link by its exact text
# 3. find_element_by_partial_link_text  find a link by partial text
# 4. find_element_by_tag_name           find by tag name
# 5. find_element_by_class_name         find by class name
# 6. find_element_by_name               find by the name attribute
# 7. find_element_by_css_selector       find by CSS selector
# 8. find_element_by_xpath              find by XPath

Selectors
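A minimal sketch exercising a few of these selectors (my own illustration; the ids 'kw' and 'su' are assumed to be Baidu's search box and search button, and '新闻' an exact link text on the home page):

from selenium import webdriver
import time

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    search_input = browser.find_element_by_id('kw')           # by id
    search_btn = browser.find_element_by_css_selector('#su')  # by CSS selector
    news_link = browser.find_element_by_link_text('新闻')      # by exact link text
    search_input.send_keys('selenium')
    search_btn.click()
    time.sleep(3)
finally:
    browser.close()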
# Simple example 1: open Baidu and type a keyword into the search box
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')

    time.sleep(2)

    search_input = browser.find_element_by_id('kw')
    key = input('Enter what you want to search for: ')
    search_input.send_keys(key)
    time.sleep(5)

except Exception as e:
    print(e)
finally:
    browser.close()

Simple usage - 1
# Simple example 2: open Baidu and log in
try:
    browser = webdriver.Chrome()
    # implicit wait: when looking up an element that is not there yet, wait up to 3 seconds
    browser.implicitly_wait(3)
    browser.get('https://www.baidu.com')

    time.sleep(2)

    login_btn = browser.find_element_by_link_text('登录')
    login_btn.click()
    user_login = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    username_input = browser.find_element_by_id('TANGRAM__PSP_10__userName')
    username_input.send_keys('13681878977')
    password_input = browser.find_element_by_id('TANGRAM__PSP_10__password')
    password_input.send_keys('zhang319!')
    submit_btn = browser.find_element_by_id('TANGRAM__PSP_10__submit')
    submit_btn.click()
    time.sleep(5)

    search_input = browser.find_element_by_id('kw')
    search_input.send_keys('名侦探柯南')
    time.sleep(10)

except Exception as e:
    print(e)
finally:
    browser.close()

Simple usage - 2
# Simple example 3: scrape product listings from JD.com
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def get_goods(browser):
    li_list = browser.find_elements_by_class_name('gl-item')
    for li in li_list:
        goods_price = li.find_element_by_css_selector('.p-price i').text
        # print(goods_price)

        goods_comment = li.find_element_by_css_selector('.p-commit strong a').text
        # print(goods_comment)

        goods_name = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('title')
        # print(goods_name)

        goods_url = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('href')

        goods_img = li.find_element_by_css_selector('.p-img a img').get_attribute('src')
        if not goods_img:
            # lazily loaded images keep their address in data-lazy-img
            goods_img = 'https:' + li.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')

        print('''
        Product name: %s
        Product price: %s
        Number of comments: %s
        Product detail URL: %s
        Product image URL: %s
        ''' % (goods_name, goods_price, goods_comment, goods_url, goods_img))

    next_page = browser.find_element_by_partial_link_text('下一页')
    time.sleep(2)
    next_page.click()
    get_goods(browser)


def spider():
    try:
        browser = webdriver.Chrome()
        browser.implicitly_wait(3)

        browser.get('https://www.jd.com')

        search_input = browser.find_element_by_id('key')
        search_input.send_keys('手机')
        search_input.send_keys(Keys.ENTER)
        time.sleep(5)

        # pull the product information out of the page
        get_goods(browser)

    except Exception as e:
        print(e)
    finally:
        browser.close()


if __name__ == '__main__':
    spider()

Simple usage - 3 (scraping JD.com product listings)
# Simple example 4: simulate the browser's back and forward buttons
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')

browser.back()
time.sleep(10)
browser.forward()
browser.close()

# Executing JavaScript
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('alert(1234)')
# while the alert opened by the JS code is showing, .close() does not take effect
browser.close()

# Tab management: tabs can be opened with JS (window.open) or with keyboard shortcuts
# such as Ctrl+T; the JS way is the most portable
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')

print(browser.window_handles)  # all open tabs
browser.switch_to_window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
browser.switch_to_window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()

# Simulating mouse drags
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By                # how to look elements up: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys            # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait = WebDriverWait(driver, 3)
# driver.implicitly_wait(3)  # or use an implicit wait

try:
    driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
    sourse = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # Option 1: queue the actions on one action chain and run them in order
    # actions = ActionChains(driver)          # get an action-chain object
    # actions.drag_and_drop(sourse, target)   # queue the drag-and-drop
    # actions.perform()

    # Option 2: separate action chains, moving a small offset each time
    ActionChains(driver).click_and_hold(sourse).perform()
    distance = target.location['x'] - sourse.location['x']

    track = 0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    ActionChains(driver).release().perform()

    time.sleep(10)

finally:
    driver.close()

Simple usage - 4 (simulating browser actions)
6. A WeChat auto-reply bot
# Pie chart of friends by gender
from wxpy import *
from pyecharts import Pie
# import webbrowser

bot = Bot(cache_path=True)  # confirm the login on your phone

# get all friend objects as a list
friends = bot.friends()

attr = ['男朋友', '女朋友', '未知性别']
value = [0, 0, 0]

for friend in friends:
    if friend.sex == 1:    # 1 means male
        value[0] += 1
    elif friend.sex == 2:  # 2 means female
        value[1] += 1
    else:
        value[2] += 1

pie = Pie('朋友男女比例')
# chart name (str), attribute names (list), values (list); is_label_show toggles the labels
pie.add("", attr, value, is_label_show=True)
pie.render('sex.html')

# open the result in a browser
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('/Users/lich/PycharmProjects/w3spider_Proj/sex.html')
time.sleep(10)
browser.close()

Friend gender pie chart
# Map of friends by province
from wxpy import *
from pyecharts import Map
from pyecharts import Pie
import webbrowser

bot = Bot(cache_path=True)  # confirm the login on your phone

# get all friend objects as a list
friends = bot.friends()

area_dic = {}  # dict keyed by province

for friend in friends:
    if friend.province not in area_dic:
        area_dic[friend.province] = 1
    else:
        area_dic[friend.province] += 1

attr = area_dic.keys()
value = area_dic.values()

map = Map('好朋友们的地域分布', width=1200, height=600)
map.add(
    "好友地域分布",
    attr,
    value,
    maptype='china',
    is_visualmap=True,  # use the VisualMap component
)

# is_visualmap -> bool: whether to use the visual-map component
map.render('area.html')

Friends-by-province map
# Auto-reply to every friend
from wxpy import *
bot = Bot(cache_path=True)

@bot.register()
def recv_send_msg(recv_msg):
    print('message received:', recv_msg.text)  # recv_msg.text is the text of the message
    return 'auto-reply: %s' % recv_msg.text

# drop into a Python shell so the program keeps running
embed()

Auto-reply to all friends
# Auto-reply to my wife
from wxpy import *
bot = Bot(cache_path=True)

girl_friend = bot.search('老婆')[0]
print(girl_friend)

@bot.register()  # handle incoming messages; recv_msg.sender is the contact who sent them
def recv_send_msg(recv_msg):
    # print('message received:', recv_msg.text)
    if recv_msg.sender == girl_friend:
        # forward a copy to the file helper so I can read it later
        recv_msg.forward(bot.file_helper, prefix='老婆留言: ')
        ms = '老婆最美丽,我对老婆的爱如滔滔江水,连绵不绝'
        print('>>> replied to my wife:', ms)
        return ms  # send the reply back

embed()

Auto-reply to my wife - 1
# Auto-reply using the Tuling (Turing) chatbot API
import json
import requests
from wxpy import *

bot = Bot(cache_path=True)

# call the Tuling chatbot API: send the message and get the bot's reply
def auto_reply(text):
    url = "http://www.tuling123.com/openapi/api"
    api_key = "9df516a74fc443769b233b01e8536a42"
    payload = {
        "key": api_key,
        "info": text,
    }
    r = requests.post(url, data=json.dumps(payload))
    result = json.loads(r.content)
    # return "[来自智能机器人] " + result["text"]
    return result["text"]

girl_friend = bot.search('老婆')[0]

@bot.register()
def forward_message(msg):
    if msg.sender == girl_friend:
        return auto_reply(msg.text)

embed()

Auto-reply to my wife - 2 (using the Tuling chatbot)