当前位置:   article > 正文

python爬虫笔记(3)

__viewstate = tree.xpath('//*[@id="__viewstate"]/@value')[0] __viewstategene

python爬虫笔记

记录时间:2021年2月1日

1、验证码识别

验证码是一种反爬机制

需要识别验证码图片中的数据,用于模拟登录操作

识别验证码的操作:

  • 人工肉眼识别(不推荐)
  • 第三方自动识别(推荐)

使用python调用第三方接口识别验证码(例如尖叫数据):

  import re
  import sys
  import urllib.parse
  import urllib.request

  # Endpoint of the third-party captcha-recognition API.
  host = 'http://apigateway.jianjiaoshuju.com'
  path = '/api/v_1/yzmCustomized.html'
  # Not passed to Request below: urllib sends a POST whenever a data
  # payload is supplied.
  method = 'POST'
  # Account credentials (placeholders).
  appcode = 'xxxxxxxxxxx'
  appKey = 'xxxxxxxxxxx'
  appSecret = 'xxxxxxxxxxx'
  querys = ''
  url = host + path

  # Form fields of the request.
  bodys = {
      'v_pic': '',  # Base64-encoded image goes here.
      'pri_id': 'ne',
  }
  post_data = urllib.parse.urlencode(bodys).encode('utf-8')

  request = urllib.request.Request(url, post_data)
  request.add_header('appcode', appcode)
  request.add_header('appKey', appKey)
  request.add_header('appSecret', appSecret)
  request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')

  # The context manager closes the HTTP response once the body is read.
  with urllib.request.urlopen(request) as resp:
      response = resp.read()

  if response:
      content = response.decode("utf-8")
      found = re.search(r'"v_code":"(.*?)"', content)
      if found:
          v_code = found.group(1)
          print(v_code)
      else:
          # No "v_code" field in the reply: show the raw reply instead of
          # crashing with an IndexError.
          print(content)

 

需求1:古诗文网的验证码识别

  1. import requests
  2. from lxml import etree
  3. import urllib.parse, urllib.request
  4. import re
  5. import base64
  def getCodeText(v_pic, pri_id):
      """Ask the jianjiaoshuju HTTP API for the text shown in a captcha image.

      Args:
          v_pic: Value sent as the ``v_pic`` form field; the caller in this
              script passes the Base64-encoded image returned by ``getv_pic``.
          pri_id: Value sent as the ``pri_id`` form field (presumably the
              captcha type; the caller in this script passes ``'ne'``).

      Returns:
          The text of the ``"v_code"`` field in the response, or
          ``'获取失败'`` when the response is empty or has no such field.
      """
      host = 'http://apigateway.jianjiaoshuju.com'
      path = '/api/v_1/yzmCustomized.html'
      # Account credentials (placeholders).
      appcode = 'xxxxxxxxxxxx'
      appKey = 'xxxxxxxxxxxx'
      appSecret = 'xxxxxxxxxxxx'

      bodys = {'v_pic': v_pic, 'pri_id': pri_id}
      post_data = urllib.parse.urlencode(bodys).encode('utf-8')

      # Supplying a data payload makes urllib send the request as a POST.
      request = urllib.request.Request(host + path, post_data)
      request.add_header('appcode', appcode)
      request.add_header('appKey', appKey)
      request.add_header('appSecret', appSecret)
      request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')

      # The context manager closes the HTTP response once the body is read.
      with urllib.request.urlopen(request) as resp:
          response = resp.read()

      if not response:
          return '获取失败'
      found = re.search(r'"v_code":"(.*?)"', response.decode("utf-8"))
      # A reply without "v_code" yields the fallback text rather than an
      # IndexError.
      return found.group(1) if found else '获取失败'
  def getv_pic(filepath):
      """Read a file and return its contents Base64-encoded as text.

      Args:
          filepath: Path of the file to read (opened in binary mode).

      Returns:
          The file's bytes encoded with Base64 and decoded to a ``str``.
      """
      # ``with`` closes the file even when reading raises.
      with open(filepath, 'rb') as f:
          return base64.b64encode(f.read()).decode('utf-8')
  if __name__ == '__main__':
      # Download the captcha image shown on the gushiwen login page and
      # print the text recognised for it.
      import os

      baseurl = 'https://so.gushiwen.org/user/login.aspx'
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
      }

      # The login page carries the (relative) URL of the captcha image.
      html = requests.get(url=baseurl, headers=headers)
      tree = etree.HTML(html.text)
      imgCode_url = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
      imgCode_data = requests.get(url=imgCode_url, headers=headers).content

      imgCode_path = './古诗文网模拟登录/imgCode.jpg'
      # open() does not create missing parent directories, so make sure the
      # output folder exists before writing the image.
      os.makedirs(os.path.dirname(imgCode_path), exist_ok=True)
      with open(imgCode_path, 'wb') as fp:
          fp.write(imgCode_data)

      CodeText = getCodeText(getv_pic(imgCode_path), 'ne')
      print(CodeText)

2、cookie

cookie:用来让服务器端记录客户端的相关状态。

  • 手动处理:通过抓包工具获取cookie值,将该值封装到headers中。(不建议)
  • 自动处理:
    —cookie的来源是哪里?
           —模拟登录post请求后,由服务器端创建。

session会话对象:
       —作用:

       1.可以进行请求的发送。
       2.如果请求过程中产生了cookie,则该cookie会被自动存储/携带在该session对象中。

创建一个session对象——“session = requests.Session()”

使用session对象进行模拟登录post请求的发送(cookie就会被存储在session中)

再使用同一个session对象发送个人主页对应的get请求(请求会自动携带cookie)
 

需求2:爬取豆瓣个人信息的相应数据(手动cookie处理)

  1. import requests
  2. from lxml import etree
  3. import re
  if __name__ == '__main__':
      # Fetch the douban home page with a hand-copied cookie and print the
      # account name shown in the top navigation bar.
      url = 'https://www.douban.com/'
      headers = {
          # NOTE(review): this looks like a cookie copied from a real
          # logged-in browser session; replace it with your own value.
          'Cookie': 'll="118316"; bid=0FIL6vjWxhw; douban-fav-remind=1; __yadk_uid=4Z8xtdej0WrqXqPer14U40sfi6Lo0xob; _vwo_uuid_v2=DC9368AFA6BB2FE1F1B553BB855F45CF2|7738f80fafd129733109fbe70c7e4bbe; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1612173329%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DV6fFSEPFUqp8jF2t6jGBYVKdE4SOFTtJxVZxIGwCIXaX8NMPRGjkfdDgeAb0rIyA%26wd%3D%26eqid%3Ddcdce7840000177e000000046017d00b%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1976354768.1552034909.1612004998.1612173337.14; __utmc=30149280; __utmz=30149280.1612173337.14.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; push_noty_num=0; push_doumail_num=0; __utmv=30149280.23190; __gads=ID=6040484a5bfb4e2e-2293a631e7c50037:T=1612173464:R:S=ALNI_MZAuEevqcWL_eMHTmRcwRdUm3ptFA; __utmt=1; dbcl2="231900772:KYdGpxBvCEI"; ck=Y91S; _pk_id.100001.8cb4=009a280458d2bfa8.1569556611.7.1612174391.1612170160.; __utmb=30149280.16.10.1612173337',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
      }
      html = requests.get(url=url, headers=headers)
      tree = etree.HTML(html.text)

      # The XPath yields an empty list when the navigation entry is absent
      # (e.g. the cookie is no longer accepted), so check before indexing.
      nav_texts = tree.xpath('//*[@id="db-global-nav"]/div/div[1]/ul/li[2]/a/span[1]/text()')
      if not nav_texts:
          raise SystemExit('Account entry not found in the page: the cookie may have expired.')

      # The nav text has the form "<username>的帐号"; keep the part before it.
      names = re.findall('(.*?)的帐号', nav_texts[0])
      if not names:
          raise SystemExit('Unexpected navigation text: ' + nav_texts[0])
      username = names[0]
      print(username)

需求3:古诗文网模拟登录(全过程,自动获取cookie)

  1. import requests
  2. from lxml import etree
  3. import urllib.parse, urllib.request
  4. import re
  5. import base64
  def getCodeText(v_pic, pri_id):
      """Ask the jianjiaoshuju HTTP API for the text shown in a captcha image.

      Args:
          v_pic: Value sent as the ``v_pic`` form field; the caller in this
              script passes the Base64-encoded image returned by ``getv_pic``.
          pri_id: Value sent as the ``pri_id`` form field (presumably the
              captcha type; the caller in this script passes ``'ne'``).

      Returns:
          The text of the ``"v_code"`` field in the response, or
          ``'获取失败'`` when the response is empty or has no such field.
      """
      host = 'http://apigateway.jianjiaoshuju.com'
      path = '/api/v_1/yzmCustomized.html'
      # Account credentials (placeholders).
      appcode = 'xxxxxxxxxxxxxxxxxxx'
      appKey = 'xxxxxxxxxxxxxxxxxxx'
      appSecret = 'xxxxxxxxxxxxxxxxxxx'

      bodys = {'v_pic': v_pic, 'pri_id': pri_id}
      post_data = urllib.parse.urlencode(bodys).encode('utf-8')

      # Supplying a data payload makes urllib send the request as a POST.
      request = urllib.request.Request(host + path, post_data)
      request.add_header('appcode', appcode)
      request.add_header('appKey', appKey)
      request.add_header('appSecret', appSecret)
      request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')

      # The context manager closes the HTTP response once the body is read.
      with urllib.request.urlopen(request) as resp:
          response = resp.read()

      if not response:
          return '获取失败'
      found = re.search(r'"v_code":"(.*?)"', response.decode("utf-8"))
      # A reply without "v_code" yields the fallback text rather than an
      # IndexError.
      return found.group(1) if found else '获取失败'
  def getv_pic(filepath):
      """Read a file and return its contents Base64-encoded as text.

      Args:
          filepath: Path of the file to read (opened in binary mode).

      Returns:
          The file's bytes encoded with Base64 and decoded to a ``str``.
      """
      # ``with`` closes the file even when reading raises.
      with open(filepath, 'rb') as f:
          return base64.b64encode(f.read()).decode('utf-8')
  if __name__ == '__main__':
      # Log in to gushiwen with a requests session (which stores and resends
      # the cookies set by the server), then read a field from the account
      # page.
      import os

      session = requests.Session()
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
      }
      url = "https://so.gushiwen.org/user/login.aspx"

      # A single fetch of the login page provides both the hidden form
      # tokens and the captcha image URL, so they come from the same page
      # load.
      html = session.get(url=url, headers=headers)
      tree = etree.HTML(html.text)
      VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
      VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
      imgCode_src = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]

      # Download the captcha through the same session and recognise it.
      imgCode_data = session.get(url=imgCode_src, headers=headers).content
      imgCode_path = './古诗文网模拟登录/imgCode.jpg'
      # open() does not create missing parent directories.
      os.makedirs(os.path.dirname(imgCode_path), exist_ok=True)
      with open(imgCode_path, 'wb') as fp:
          fp.write(imgCode_data)
      CodeText = getCodeText(getv_pic(imgCode_path), 'ne')

      # Submit the login form.
      data = {
          '__VIEWSTATE': VIEWSTATE,
          '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
          'from': '',
          'email': 'xxxxxxxxxxxxxxxxxxx',
          'pwd': 'xxxxxxxxxxxxxxxxxxx',
          'code': CodeText,
          'denglu': '登录'
      }
      html = session.post(url=url, headers=headers, data=data)
      # Keep the response page on disk for inspection.
      with open('./古诗文网模拟登录/古诗文网.html', 'w', encoding='utf-8') as fp:
          fp.write(html.text)

      # Read a field (e.g. the bound phone number) from the account page.
      html = session.get(url='https://so.gushiwen.org/user/collect.aspx', headers=headers)
      tree = etree.HTML(html.text)
      phone = tree.xpath('//*[@id="mainSearch"]/div[3]/div[1]/div[3]/span/text()')
      if phone:
          print(phone[0])
      else:
          # The XPath matched nothing; report it instead of raising
          # IndexError.
          print('Phone number not found: login may have failed (e.g. wrong captcha).')

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Gausst松鼠会/article/detail/347394
推荐阅读
相关标签
  

闽ICP备14008679号