urllib is Python's package for working with URLs (Uniform Resource Locators); you can use it to fetch remote data and save it. This article collects urllib usage notes on headers, proxies, timeouts, authentication, and exception handling.
1. Reading a web page
import urllib.request

response = urllib.request.urlopen('http://python.org/')
html = response.read()

# the same request, built explicitly as a Request object:
import urllib.request

req = urllib.request.Request('http://python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
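A Request can also carry custom headers from the start, which is often all you need instead of patching an opener. A minimal sketch (the User-Agent string here is just an example value):

import urllib.request

# pass headers when constructing the Request
req = urllib.request.Request(
    'http://python.org/',
    headers={'User-Agent': 'Mozilla/5.0 (example)'}
)
the_page = urllib.request.urlopen(req).read()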
2. Simulating a login with custom headers and cookies (Zhihu example)

'''
Created on 2016-05-31
@author: gionee
'''
import gzip
import re
import urllib.request
import urllib.parse
import http.cookiejar

def ungzip(data):
    try:
        print("Trying to decompress...")
        data = gzip.decompress(data)
        print("Decompression done")
    except OSError:
        print("Not compressed, nothing to decompress")
    return data

def getXSRF(data):
    cer = re.compile('name="_xsrf" value="(.*)"', flags=0)
    strlist = cer.findall(data)
    return strlist[0]

def getOpener(head):
    # cookie handling: keep cookies in a jar shared by all requests
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro)
    header = []
    for key, value in head.items():
        elem = (key, value)
        header.append(elem)
    opener.addheaders = header
    return opener

# header values can be copied from the browser's developer tools (e.g. Firebug)
header = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.zhihu.com',
    'DNT': '1'
}

url = 'http://www.zhihu.com/'
opener = getOpener(header)
op = opener.open(url)
data = op.read()
data = ungzip(data)
_xsrf = getXSRF(data.decode())

url += "login/email"
email = "your login email"
password = "your password"
postDict = {
    '_xsrf': _xsrf,
    'email': email,
    'password': password,
    'rememberme': 'y'
}
postData = urllib.parse.urlencode(postDict).encode()
op = opener.open(url, postData)
data = op.read()
data = ungzip(data)

print(data.decode())
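If the login cookies should survive across runs, http.cookiejar can also persist them to disk. A sketch assuming a placeholder filename cookies.txt:

import http.cookiejar
import urllib.request

cj = http.cookiejar.MozillaCookieJar('cookies.txt')
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# ... perform the login with this opener, as above ...
cj.save(ignore_discard=True, ignore_expires=True)   # write cookies to disk

# in a later session, reload them before making requests:
cj.load(ignore_discard=True, ignore_expires=True)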
3. Handling exceptions

import urllib.request
import urllib.error

req = urllib.request.Request('http://www.lz881228.blog.163.com')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.read().decode("utf8"))
# catching both HTTPError and URLError; HTTPError is a subclass of
# URLError, so it must be caught first:
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("http://www.abc.com/")
try:
    response = urlopen(req)
except HTTPError as e:
    print("The server couldn't fulfill the request.")
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print("good!")
    print(response.read().decode("utf8"))
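Alternatively, because HTTPError is a subclass of URLError, a single handler can cover both cases and tell them apart by attribute; this is a variant of the pattern shown in the official urllib HOWTO:

from urllib.request import Request, urlopen
from urllib.error import URLError

req = Request("http://www.abc.com/")
try:
    response = urlopen(req)
except URLError as e:
    if hasattr(e, 'code'):
        # an HTTPError: the server answered with an error status
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
    else:
        # a plain URLError: we never reached a server
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    print(response.read().decode("utf8"))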
4. HTTP basic authentication

import urllib.request

# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "https://www.111cn.net/"
password_mgr.add_password(None, top_level_url, 'rekfan', 'xxxxxx')

handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

# create "opener" (OpenerDirector instance)
opener = urllib.request.build_opener(handler)

# use the opener to fetch a URL
a_url = "https://www.111cn.net/"
x = opener.open(a_url)
print(x.read())

# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)
a = urllib.request.urlopen(a_url).read().decode('utf8')

print(a)
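As the comment above notes, when the realm is known it can be passed instead of None, using the realm-aware manager. A sketch where 'Member Area' is a hypothetical realm name taken from the server's WWW-Authenticate header:

import urllib.request

password_mgr = urllib.request.HTTPPasswordMgr()  # realm-aware variant
# 'Member Area' is a hypothetical realm; the credentials are placeholders
password_mgr.add_password('Member Area', 'https://www.111cn.net/', 'rekfan', 'xxxxxx')
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(handler)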
5. Using a proxy

import urllib.request

# ProxyHandler maps URL schemes ('http', 'https') to proxy URLs. Note that
# urllib itself only supports HTTP/HTTPS proxies; a SOCKS5 proxy needs a
# third-party library such as PySocks. The address below is a placeholder.
proxy_support = urllib.request.ProxyHandler({'http': 'http://localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

a = urllib.request.urlopen("http://www.baidu.com").read().decode("utf8")
print(a)
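If you'd rather not change the process-wide default with install_opener, the handler can be used through its own opener alone. A sketch assuming the same placeholder proxy address:

import urllib.request

# build an opener with the proxy but don't install it globally;
# other urlopen() calls in the process remain unaffected
proxy = urllib.request.ProxyHandler({'http': 'http://localhost:1080'})
opener = urllib.request.build_opener(proxy)
data = opener.open('http://www.baidu.com').read().decode('utf8')
print(data)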
6. Setting a timeout

import socket
import urllib.request

# timeout in seconds
timeout = 2
socket.setdefaulttimeout(timeout)

# this call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module
req = urllib.request.Request('http://www.111cn.net/')
a = urllib.request.urlopen(req).read()
print(a)
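As an alternative to changing the socket-wide default, urlopen() accepts a per-call timeout argument. A minimal sketch (the URL and the 5-second value are placeholders):

import socket
import urllib.request
import urllib.error

try:
    # the timeout applies to blocking operations such as the connection attempt
    a = urllib.request.urlopen('http://www.111cn.net/', timeout=5).read()
    print(a)
except urllib.error.URLError as e:
    # a connect timeout surfaces as a URLError wrapping socket.timeout
    print('Request failed:', e.reason)
except socket.timeout:
    # a timeout while reading the body can propagate directly
    print('Request timed out while reading')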