赞
踩
记录时间:2021年2月1日
验证码是网站常用的一种反爬机制。
要进行模拟登录,就需要先识别出验证码图片中的字符。
识别验证码的操作(调用第三方识别接口的示例代码):
import re
import sys
import urllib.parse
import urllib.request

# Minimal sample: POST a base64-encoded captcha image to the recognition
# service and print the recognised text found in the "v_code" field.
host = 'http://apigateway.jianjiaoshuju.com'
path = '/api/v_1/yzmCustomized.html'
method = 'POST'  # not read below; Request() sends POST because a body is given
appcode = 'xxxxxxxxxxx'
appKey = 'xxxxxxxxxxx'
appSecret = 'xxxxxxxxxxx'
querys = ''  # not read below
bodys = {}
url = host + path
# Base64 encoding of the captcha image (left empty in this sample)
bodys['v_pic'] = ''
bodys['pri_id'] = 'ne'
post_data = bytes(urllib.parse.urlencode(bodys), encoding="utf-8")
request = urllib.request.Request(url, post_data)
request.add_header('appcode', appcode)
request.add_header('appKey', appKey)
request.add_header('appSecret', appSecret)
request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
# The context manager closes the HTTP response even if read() fails.
with urllib.request.urlopen(request) as resp:
    response = resp.read()
if response:
    content = response.decode("utf-8")
    matches = re.findall('"v_code":"(.*?)"', content)
    if matches:
        v_code = matches[0]
        print(v_code)
    else:
        # No "v_code" field in the reply: show the raw body instead of
        # crashing with IndexError.
        print(content, file=sys.stderr)

- import requests
- from lxml import etree
- import urllib.parse, urllib.request
- import re
- import base64
-
-
# Get the text shown in a captcha image
def getCodeText(v_pic, pri_id, timeout=10):
    """Send a captcha image to the recognition service and return its text.

    Args:
        v_pic: Base64-encoded image data, sent as the ``v_pic`` form field.
        pri_id: Value sent as the ``pri_id`` form field (the sample code
            passes ``'ne'``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first ``"v_code"`` value found in the response body, or
        ``'获取失败'`` when the body is empty or contains no such field.

    Raises:
        urllib.error.URLError: If the HTTP request itself fails.
    """
    host = 'http://apigateway.jianjiaoshuju.com'
    path = '/api/v_1/yzmCustomized.html'
    appcode = 'xxxxxxxxxxxx'
    appKey = 'xxxxxxxxxxxx'
    appSecret = 'xxxxxxxxxxxx'
    url = host + path
    v_code = '获取失败'

    bodys = {'v_pic': v_pic, 'pri_id': pri_id}
    post_data = bytes(urllib.parse.urlencode(bodys), encoding="utf-8")
    # Supplying a body makes urllib send this request as a POST.
    request = urllib.request.Request(url, post_data)
    request.add_header('appcode', appcode)
    request.add_header('appKey', appKey)
    request.add_header('appSecret', appSecret)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')

    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        response = resp.read()

    if response:
        content = response.decode("utf-8")
        matches = re.findall('"v_code":"(.*?)"', content)
        # Keep the failure marker when the reply carries no "v_code" field
        # instead of raising IndexError.
        if matches:
            v_code = matches[0]
    return v_code
-
-
# Get the base64-encoded v_pic value
def getv_pic(filepath):
    """Return the contents of the file at *filepath* as a base64 string.

    Args:
        filepath: Path of the image file to read (opened in binary mode).

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``.
    """
    # ``with`` guarantees the file is closed even if read() raises.
    with open(filepath, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')
-
-
if __name__ == '__main__':
    # Download the captcha image from the login page, save it to disk and
    # print the text recognised by getCodeText().
    import os  # local import: only needed to create the output directory

    baseurl = 'https://so.gushiwen.org/user/login.aspx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    html = requests.get(url=baseurl, headers=headers)
    tree = etree.HTML(html.text)

    # Fail with a readable message instead of IndexError if the page
    # layout changed and the captcha <img> is missing.
    captcha_srcs = tree.xpath('//*[@id="imgCode"]/@src')
    if not captcha_srcs:
        raise SystemExit('captcha image (id="imgCode") not found on the login page')
    imgCode_url = 'https://so.gushiwen.org' + captcha_srcs[0]
    imgCode_data = requests.get(url=imgCode_url, headers=headers).content

    imgCode_path = './古诗文网模拟登录/imgCode.jpg'
    # open() does not create missing directories, so create it first.
    os.makedirs(os.path.dirname(imgCode_path), exist_ok=True)
    with open(imgCode_path, 'wb') as fp:
        fp.write(imgCode_data)

    CodeText = getCodeText(getv_pic(imgCode_path), 'ne')
    print(CodeText)

cookie:用来让服务器端记录客户端的相关状态。
session会话对象:
—作用:
1.可以进行请求的发送。
2.如果请求过程中产生了cookie,则该cookie会被自动存储/携带在该session对象中。
—用法:
1.创建一个session对象:“session = requests.Session()”。
2.使用session对象发送模拟登录的post请求(请求过程中产生的cookie会被存储在session中)。
3.再使用同一个session对象对个人主页发送get请求(请求会自动携带已存储的cookie)。
- import requests
- from lxml import etree
- import re
-
if __name__ == '__main__':
    # Request the douban home page with a manually copied Cookie header and
    # print the user name shown in the global navigation bar.
    url = 'https://www.douban.com/'
    # NOTE(review): this is a browser session cookie committed in plain
    # text; consider loading it from the environment or a local file.
    headers = {
        'Cookie': 'll="118316"; bid=0FIL6vjWxhw; douban-fav-remind=1; __yadk_uid=4Z8xtdej0WrqXqPer14U40sfi6Lo0xob; _vwo_uuid_v2=DC9368AFA6BB2FE1F1B553BB855F45CF2|7738f80fafd129733109fbe70c7e4bbe; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1612173329%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DV6fFSEPFUqp8jF2t6jGBYVKdE4SOFTtJxVZxIGwCIXaX8NMPRGjkfdDgeAb0rIyA%26wd%3D%26eqid%3Ddcdce7840000177e000000046017d00b%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1976354768.1552034909.1612004998.1612173337.14; __utmc=30149280; __utmz=30149280.1612173337.14.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.23190; __gads=ID=6040484a5bfb4e2e-2293a631e7c50037:T=1612173464:R:S=ALNI_MZAuEevqcWL_eMHTmRcwRdUm3ptFA; __utmt=1; dbcl2="231900772:KYdGpxBvCEI"; ck=Y91S; _pk_id.100001.8cb4=009a280458d2bfa8.1569556611.7.1612174391.1612170160.; __utmb=30149280.16.10.1612173337',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    html = requests.get(url=url, headers=headers)
    tree = etree.HTML(html.text)

    # The xpath yields nothing when the nav bar has no account entry;
    # exit with a message instead of raising IndexError.
    spans = tree.xpath('//*[@id="db-global-nav"]/div/div[1]/ul/li[2]/a/span[1]/text()')
    if not spans:
        raise SystemExit('account label not found in the navigation bar (not logged in?)')
    span = spans[0]

    # The label has the form "<name>的帐号"; keep only the name part.
    usernames = re.findall('(.*?)的帐号', span)
    if not usernames:
        raise SystemExit('unexpected account label: ' + span)
    username = usernames[0]
    print(username)

- import requests
- from lxml import etree
- import urllib.parse, urllib.request
- import re
- import base64
-
-
# Get the text shown in a captcha image
def getCodeText(v_pic, pri_id, timeout=10):
    """Send a captcha image to the recognition service and return its text.

    Args:
        v_pic: Base64-encoded image data, sent as the ``v_pic`` form field.
        pri_id: Value sent as the ``pri_id`` form field (the sample code
            passes ``'ne'``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first ``"v_code"`` value found in the response body, or
        ``'获取失败'`` when the body is empty or contains no such field.

    Raises:
        urllib.error.URLError: If the HTTP request itself fails.
    """
    host = 'http://apigateway.jianjiaoshuju.com'
    path = '/api/v_1/yzmCustomized.html'
    appcode = 'xxxxxxxxxxxxxxxxxxx'
    appKey = 'xxxxxxxxxxxxxxxxxxx'
    appSecret = 'xxxxxxxxxxxxxxxxxxx'
    url = host + path
    v_code = '获取失败'

    bodys = {'v_pic': v_pic, 'pri_id': pri_id}
    post_data = bytes(urllib.parse.urlencode(bodys), encoding="utf-8")
    # Supplying a body makes urllib send this request as a POST.
    request = urllib.request.Request(url, post_data)
    request.add_header('appcode', appcode)
    request.add_header('appKey', appKey)
    request.add_header('appSecret', appSecret)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')

    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request, timeout=timeout) as resp:
        response = resp.read()

    if response:
        content = response.decode("utf-8")
        matches = re.findall('"v_code":"(.*?)"', content)
        # Keep the failure marker when the reply carries no "v_code" field
        # instead of raising IndexError.
        if matches:
            v_code = matches[0]
    return v_code
-
-
# Get the base64-encoded v_pic value
def getv_pic(filepath):
    """Return the contents of the file at *filepath* as a base64 string.

    Args:
        filepath: Path of the image file to read (opened in binary mode).

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str``.
    """
    # ``with`` guarantees the file is closed even if read() raises.
    with open(filepath, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')
-
-
if __name__ == '__main__':
    # Log in to so.gushiwen.org through a requests.Session so that cookies
    # set during login are sent with the later profile-page request.
    import os  # local import: only needed to create the output directory

    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
    }
    url = "https://so.gushiwen.org/user/login.aspx"
    output_dir = './古诗文网模拟登录'
    # open() does not create missing directories, so create it first.
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the login page once; the hidden form tokens and the captcha
    # URL are all read from this single response.
    html = session.get(url=url, headers=headers)
    tree = etree.HTML(html.text)
    viewstate_values = tree.xpath('//*[@id="__VIEWSTATE"]/@value')
    generator_values = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')
    captcha_srcs = tree.xpath('//*[@id="imgCode"]/@src')
    if not (viewstate_values and generator_values and captcha_srcs):
        raise SystemExit('login form fields or captcha image not found on the login page')
    VIEWSTATE = viewstate_values[0]
    VIEWSTATEGENERATOR = generator_values[0]

    # Download the captcha with the same session and recognise its text.
    imgCode_src = 'https://so.gushiwen.org' + captcha_srcs[0]
    imgCode_data = session.get(url=imgCode_src, headers=headers).content
    imgCode_path = './古诗文网模拟登录/imgCode.jpg'
    with open(imgCode_path, 'wb') as fp:
        fp.write(imgCode_data)
    CodeText = getCodeText(getv_pic(imgCode_path), 'ne')

    # Log in
    data = {
        '__VIEWSTATE': VIEWSTATE,
        '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
        'from': '',
        'email': 'xxxxxxxxxxxxxxxxxxx',
        'pwd': 'xxxxxxxxxxxxxxxxxxx',
        'code': CodeText,
        'denglu': '登录'
    }
    html = session.post(url=url, headers=headers, data=data)
    # Save the login response so the result can be inspected.
    with open('./古诗文网模拟登录/古诗文网.html', 'w', encoding='utf-8') as fp:
        fp.write(html.text)

    # Read personal information from the profile page (e.g. the bound
    # phone number); the session sends the stored cookies automatically.
    html = session.get(url='https://so.gushiwen.org/user/collect.aspx', headers=headers)
    tree = etree.HTML(html.text)
    phones = tree.xpath('//*[@id="mainSearch"]/div[3]/div[1]/div[3]/span/text()')
    if not phones:
        # The element is absent from the returned page; exit with a message
        # instead of raising IndexError.
        raise SystemExit('phone number not found on the profile page (login failed?)')
    phone = phones[0]
    print(phone)

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。