当前位置:   article > 正文

Python3学习:urllib的使用方法 — Python 3 urllib response.read()

Python 3 urllib response.read()

urllib是python的一个获取url(Uniform Resource Locators,统一资源定位符)的模块,可以利用它来抓取远程的数据并进行保存。本文整理了urllib使用中关于header、代理、超时、认证、异常处理等方面的处理方法。


1、简单读取网页信息

  1. import urllib.request
  2. response = urllib.request.urlopen('http://python.org/')
  3. html = response.read()

2、使用request

  1. import urllib.request
  2. req = urllib.request.Request('http://python.org/')
  3. response = urllib.request.urlopen(req)
  4. the_page = response.read()

3、发送数据,以登录知乎为例

  1. '''
  2. Created on 2016年5月31日
  3. @author: gionee
  4. '''
  5. import gzip
  6. import re
  7. import urllib.request
  8. import urllib.parse
  9. import http.cookiejar
  10. def ungzip(data):
  11. try:
  12. print("尝试解压缩...")
  13. data = gzip.decompress(data)
  14. print("解压完毕")
  15. except:
  16. print("未经压缩,无需解压")
  17. return data
  18. def getXSRF(data):
  19. cer = re.compile('name=\"_xsrf\" value=\"(.*)\"',flags = 0)
  20. strlist = cer.findall(data)
  21. return strlist[0]
  22. def getOpener(head):
  23. # cookies 处理
  24. cj = http.cookiejar.CookieJar()
  25. pro = urllib.request.HTTPCookieProcessor(cj)
  26. opener = urllib.request.build_opener(pro)
  27. header = []
  28. for key,value in head.items():
  29. elem = (key,value)
  30. header.append(elem)
  31. opener.addheaders = header
  32. return opener
  33. # header信息可以通过firebug获得
  34. header = {
  35. 'Connection': 'Keep-Alive',
  36. 'Accept': 'text/html, application/xhtml+xml, */*',
  37. 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
  38. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
  39. 'Accept-Encoding': 'gzip, deflate',
  40. 'Host': 'www.zhihu.com',
  41. 'DNT': '1'
  42. }
  43. url = 'http://www.zhihu.com/'
  44. opener = getOpener(header)
  45. op = opener.open(url)
  46. data = op.read()
  47. data = ungzip(data)
  48. _xsrf = getXSRF(data.decode())
  49. url += "login/email"
  50. email = "登录账号"
  51. password = "登录密码"
  52. postDict = {
  53. '_xsrf': _xsrf,
  54. 'email': email,
  55. 'password': password,
  56. 'rememberme': 'y'
  57. }
  58. postData = urllib.parse.urlencode(postDict).encode()
  59. op = opener.open(url,postData)
  60. data = op.read()
  61. data = ungzip(data)
  62. print(data.decode())


4、http错误

  1. import urllib.request
  2. req = urllib.request.Request('http://www.lz881228.blog.163.com ')
  3. try:
  4. urllib.request.urlopen(req)
  5. except urllib.error.HTTPError as e:
  6. print(e.code)
  7. print(e.read().decode("utf8"))

5、异常处理

  1. from urllib.request import Request, urlopen
  2. from urllib.error import URLError, HTTPError
  3. req = Request("http://www.abc.com /")
  4. try:
  5. response = urlopen(req)
  6. except HTTPError as e:
  7. print('The server couldn't fulfill the request.')
  8. print('Error code: ', e.code)
  9. except URLError as e:
  10. print('We failed to reach a server.')
  11. print('Reason: ', e.reason)
  12. else:
  13. print("good!")
  14. print(response.read().decode("utf8"))

6、http认证

  1. import urllib.request
  2. # create a password manager
  3. password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
  4. # Add the username and password.
  5. # If we knew the realm, we could use it instead of None.
  6. top_level_url = "https://www.111cn.net /"
  7. password_mgr.add_password(None, top_level_url, 'rekfan', 'xxxxxx')
  8. handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
  9. # create "opener" (OpenerDirector instance)
  10. opener = urllib.request.build_opener(handler)
  11. # use the opener to fetch a URL
  12. a_url = "https://www.111cn.net /"
  13. x = opener.open(a_url)
  14. print(x.read())
  15. # Install the opener.
  16. # Now all calls to urllib.request.urlopen use our opener.
  17. urllib.request.install_opener(opener)
  18. a = urllib.request.urlopen(a_url).read().decode('utf8')
  19. print(a)

7、使用代理

  1. import urllib.request
  2. proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
  3. opener = urllib.request.build_opener(proxy_support)
  4. urllib.request.install_opener(opener)
  5. a = urllib.request.urlopen("http://www.baidu.com ").read().decode("utf8")
  6. print(a)

8、超时

  1. import socket
  2. import urllib.request
  3. # timeout in seconds
  4. timeout = 2
  5. socket.setdefaulttimeout(timeout)
  6. # this call to urllib.request.urlopen now uses the default timeout
  7. # we have set in the socket module
  8. req = urllib.request.Request('http://www.111cn.net /')
  9. a = urllib.request.urlopen(req).read()
  10. print(a)



声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/995346
推荐阅读
相关标签
  

闽ICP备14008679号