# 1. urlopen()
# import urllib.request
# response = urllib.request.urlopen('https://www.python.org')
# print(response.read().decode('utf-8'))

# 1.1 type()
import urllib.request

response = urllib.request.urlopen('https://www.baidu.com')
print(type(response))
# <class 'http.client.HTTPResponse'>

# 2 getheaders()
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))

# 200
# [('Connection', 'close'), ('Content-Length', '49928'), ('Server', 'nginx')]
# nginx

# 3 The data parameter and urllib.parse
import urllib.parse
import urllib.request

# Pass a parameter 'word' with the value 'hello'; it must be encoded
# into the bytes type (a byte stream) first.
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

# 4 The timeout parameter
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/post', timeout=1)
print(response.read())

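# If the server does not respond within the timeout, urlopen() raises
# URLError. A minimal sketch of catching it; the very short timeout here
# is chosen only to trigger the exception:
import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
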
# 5 The Request class
import urllib.request

request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
data_dict = {
    'name': 'lle'
}
data = bytes(parse.urlencode(data_dict), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

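# Headers can also be attached after constructing the Request; a minimal
# sketch of the equivalent add_header() call, reusing url and data from
# the example above:
req = request.Request(url=url, data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
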
# 6 Handler classes
# HTTPDefaultErrorHandler: handles HTTP response errors, which all raise
#     exceptions of the HTTPError type
# HTTPRedirectHandler: handles redirects
# HTTPCookieProcessor: handles Cookies
# ProxyHandler: sets a proxy; the default proxy is empty
# HTTPPasswordMgr: manages passwords; it maintains a table of usernames
#     and passwords
# HTTPBasicAuthHandler: manages authentication when opening a link that
#     requires it
# (a sketch of wiring handlers into an opener follows)

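# Every handler above plugs into an opener via build_opener(); urlopen()
# itself is just a default opener built the same way. A minimal sketch,
# with an illustrative mix of handlers:
import http.cookiejar
import urllib.request

opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()),
    urllib.request.ProxyHandler({})  # an empty mapping means no proxy
)
response = opener.open('http://www.baidu.com')
print(response.status)
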
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

username = 'username'
password = 'password'
url = 'http://localhost:5000/'

p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    result = opener.open(url)
    html = result.read().decode('utf8')
    print(html)
except URLError as e:
    print(e.reason)

# 7 Proxy configuration
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
try:
    response = opener.open('https://www.baidu.com')
    print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

# 8 Cookies
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)

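# CookieJar only holds cookies in memory. A minimal sketch of persisting
# them with MozillaCookieJar, a subclass that reads and writes the
# Mozilla cookies.txt format (the filename is an arbitrary choice):
import http.cookiejar
import urllib.request

filename = 'cookies.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
opener.open('http://www.baidu.com')
# ignore_discard keeps session cookies; ignore_expires keeps expired ones
cookie.save(ignore_discard=True, ignore_expires=True)
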
# 9 Parsing links: urlparse() usage
# Everything before :// is the scheme, i.e. the protocol; what comes before
# the first / is the netloc, i.e. the domain; after it comes the path, i.e.
# the access path; the segment after the semicolon ; is params; after the
# question mark ? comes the query, generally used in GET-style URLs; after
# the hash # is the fragment (anchor), used to jump straight to a position
# inside the page.
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
# <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')

from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result.scheme, result[0], result.netloc, result[1], sep='\n')
# http
# http
# www.baidu.com
# www.baidu.com

# 10 urlunparse() usage
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
# http://www.baidu.com/index.html;user?a=6#comment

# 11 urlsplit() usage
from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
# SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')

from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])
# http http

# 12 urlunsplit() usage
from urllib.parse import urlunsplit

data = ['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']
print(urlunsplit(data))
# http://www.baidu.com/index.html?a=6#comment

# 13 urlencode() usage
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
# http://www.baidu.com?name=germey&age=22

# 14 parse_qs()
from urllib.parse import parse_qs

query = 'name=germey&age=22'
print(parse_qs(query))
# {'name': ['germey'], 'age': ['22']}

# 15 parse_qsl()
from urllib.parse import parse_qsl

query = 'name=germey&age=22'
print(parse_qsl(query))
# [('name', 'germey'), ('age', '22')]

# 16 quote()
from urllib.parse import quote

keyword = '壁纸'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)
# https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8

# 17 unquote()
from urllib.parse import unquote

url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
print(unquote(url))
# https://www.baidu.com/s?wd=壁纸

# 18 Analyzing robots.txt: RobotFileParser
# set_url(): sets the link to the robots.txt file; if the link was already
#     passed when the RobotFileParser object was created, this method does
#     not need to be called again.
# read(): fetches robots.txt and analyzes it. Note that this method performs
#     the read and the analysis; if it is not called, every subsequent check
#     returns False. It returns nothing, but the read has been performed.
# parse(): parses robots.txt; its argument is the content of (certain lines
#     of) robots.txt, which it analyzes according to the robots.txt syntax.
# can_fetch(): takes two arguments, a User-agent and the URL to fetch, and
#     returns True or False depending on whether that crawler may fetch
#     the URL.
# mtime(): returns the time robots.txt was last fetched and analyzed, which
#     is useful for long-running crawlers that need to check for a fresh
#     robots.txt periodically.
# modified(): likewise helpful for long-running crawlers; it sets the
#     current time as the time robots.txt was last fetched and analyzed.
# (a refresh sketch using mtime() and modified() follows the examples below)

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('http://www.jianshu.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
# False
# False

from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
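
# The mtime() and modified() methods described above are not exercised by
# the examples; a minimal sketch of a long-running crawler refreshing its
# cached rules (the one-hour threshold is an arbitrary choice):
import time
from urllib.robotparser import RobotFileParser

rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.read()  # parse() runs internally and records the fetch time

# ... crawl for a while ...

if time.time() - rp.mtime() > 3600:
    rp.read()
    rp.modified()  # explicitly record the new fetch time
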
Code excerpted from 《Python 3网络爬虫开发实战》 (Python 3 Web Scraping Development in Practice).