
Python 3 Web Crawling: urllib Usage

# 1. urlopen()
# import urllib.request
# response = urllib.request.urlopen('https://www.python.org')
# print(response.read().decode('utf-8'))

# 1.1 type()
import urllib.request

response = urllib.request.urlopen('https://www.baidu.com')
print(type(response))
# <class 'http.client.HTTPResponse'>

# 2. getheaders()
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(response.status)
print(response.getheaders())
print(response.getheader('Server'))
# 200
# [('Connection', 'close'), ('Content-Length', '49928'), ('Server', 'nginx')]
# nginx

# 3. The data parameter and urllib.parse
import urllib.parse
import urllib.request

# Pass one parameter, word, with the value hello; it must be encoded into a bytes object
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

# 4. The timeout parameter
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

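A timeout that expires surfaces as an exception, so a natural companion to the snippet above is to catch it; a minimal sketch, assuming httpbin.org as the test endpoint and a deliberately tiny 0.1-second timeout to force the error:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # urlopen wraps the low-level socket timeout in a URLError
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
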
# 5. The Request class
import urllib.request

request = urllib.request.Request('https://python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

# Request with headers, data and method
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}
dict = {
    'name': 'lle'
}
data = bytes(parse.urlencode(dict), encoding='utf8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))

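Headers can also be attached after the Request object is built, via add_header(); a small variation on the snippet above, reusing the same httpbin.org URL and example form field:

from urllib import request, parse

url = 'http://httpbin.org/post'
data = bytes(parse.urlencode({'name': 'lle'}), encoding='utf8')
req = request.Request(url=url, data=data, method='POST')
# add_header(key, value) is an alternative to passing a headers dict to Request()
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
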
# 6. Handler classes
# HTTPDefaultErrorHandler: handles HTTP response errors, which are raised as HTTPError exceptions
# HTTPRedirectHandler: handles redirects
# HTTPCookieProcessor: handles Cookies
# ProxyHandler: sets a proxy; the default proxy is empty
# HTTPPasswordMgr: manages passwords; it maintains a table of usernames and passwords
# HTTPBasicAuthHandler: manages HTTP Basic authentication when opening a link that requires it
from urllib.request import HTTPPasswordMgrWithDefaultRealm, HTTPBasicAuthHandler, build_opener
from urllib.error import URLError

username = 'username'
password = 'password'
url = 'http://localhost:5000/'

p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)

try:
    result = opener.open(url)
    html = result.read().decode('utf8')
    print(html)
except URLError as e:
    print(e.reason)

# 7. Proxy configuration
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
try:
    response = opener.open('https://www.baidu.com')
    print(response.read().decode('utf-8'))
except URLError as e:
    print(e.reason)

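If the proxy requires authentication, the credentials can be embedded in the proxy URL; a minimal sketch, where the username, password, and 127.0.0.1:9743 address are all placeholders:

from urllib.request import ProxyHandler, build_opener

# credentials and address below are placeholders, not a real proxy
proxy_handler = ProxyHandler({
    'http': 'http://username:password@127.0.0.1:9743',
    'https': 'http://username:password@127.0.0.1:9743'
})
opener = build_opener(proxy_handler)
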
# 8. Cookies
import http.cookiejar
import urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)

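If the cookies need to survive between runs, http.cookiejar also offers file-backed jars; a minimal sketch using MozillaCookieJar, where the cookies.txt filename is just an illustrative choice:

import http.cookiejar
import urllib.request

filename = 'cookies.txt'  # example output file
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
# ignore_discard keeps session cookies, ignore_expires keeps already-expired ones
cookie.save(ignore_discard=True, ignore_expires=True)
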
# 9. Parsing URLs: urlparse()
# Everything before :// is the scheme, i.e. the protocol; the part before the first / is the
# netloc, i.e. the domain, and after it comes the path, i.e. the access path; after the
# semicolon ; come the params; after the question mark ? comes the query, typically used in
# GET-style URLs; after the hash # comes the fragment, an anchor used to jump straight to a
# position inside the page
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)
# <class 'urllib.parse.ParseResult'> ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')

# scheme= supplies a default protocol when the URL itself has none
from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)
# ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=5', fragment='comment')

# allow_fragments=False folds the fragment into the preceding component
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html#comment', allow_fragments=False)
print(result.scheme, result[0], result.netloc, result[1], sep='\n')
# http
# http
# www.baidu.com
# www.baidu.com

# 10. urlunparse()
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
# http://www.baidu.com/index.html;user?a=6#comment

# 11. urlsplit()
from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result)
# SplitResult(scheme='http', netloc='www.baidu.com', path='/index.html;user', query='id=5', fragment='comment')

from urllib.parse import urlsplit

result = urlsplit('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result[0])
# http http

# 12. urlunsplit()
from urllib.parse import urlunsplit

data = ['http', 'www.baidu.com', 'index.html', 'a=6', 'comment']
print(urlunsplit(data))
# http://www.baidu.com/index.html?a=6#comment

# 13. urlencode()
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
# http://www.baidu.com?name=germey&age=22

# 14. parse_qs()
from urllib.parse import parse_qs

query = 'name=germey&age=22'
print(parse_qs(query))
# {'name': ['germey'], 'age': ['22']}

# 15. parse_qsl()
from urllib.parse import parse_qsl

query = 'name=germey&age=22'
print(parse_qsl(query))
# [('name', 'germey'), ('age', '22')]

# 16. quote()
from urllib.parse import quote

keyword = '壁纸'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)
# https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8

# 17. unquote()
from urllib.parse import unquote

url = 'https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8'
print(unquote(url))
# https://www.baidu.com/s?wd=壁纸

# 18. Parsing robots.txt: RobotFileParser
# set_url(): sets the URL of the robots.txt file; if the link was already passed when the
#   RobotFileParser object was created, there is no need to call this method again
# read(): fetches the robots.txt file and analyzes it; note that this method performs the actual
#   read-and-parse step, and if it is never called every later check returns False, so be sure to
#   call it; it returns nothing, but the read has been carried out
# parse(): parses robots.txt; the argument is a sequence of lines from robots.txt, which it
#   analyzes according to the robots.txt syntax rules
# can_fetch(): takes two arguments, a User-agent and the URL to crawl, and returns True or False
#   to indicate whether that crawler may fetch the URL
# mtime(): returns the time robots.txt was last fetched and parsed, which is useful for
#   long-running crawlers that need to re-check the latest robots.txt periodically
# modified(): also useful for long-running crawlers; it sets the time robots.txt was last fetched
#   and parsed to the current time (a small usage sketch of mtime() and modified() follows the
#   listing below)
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url('http://www.jianshu.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
# False
# False

from urllib.robotparser import RobotFileParser
from urllib.request import urlopen

rp = RobotFileParser()
rp.parse(urlopen('http://www.jianshu.com/robots.txt').read().decode('utf-8').split('\n'))
print(rp.can_fetch('*', 'http://www.jianshu.com/p/b67554025d7d'))
print(rp.can_fetch('*', 'http://www.jianshu.com/search?q=python&page=1&type=collections'))
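
The mtime() and modified() methods described above are not exercised in the excerpt; a minimal sketch, assuming a long-running crawler that treats cached rules older than one day as stale (the interval is an arbitrary illustrative choice):

import time
from urllib.robotparser import RobotFileParser

rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp.read()
rp.modified()  # record "now" as the time robots.txt was last fetched and parsed

# later, decide whether the cached rules should be refreshed
if time.time() - rp.mtime() > 24 * 60 * 60:
    rp.read()      # re-fetch robots.txt
    rp.modified()  # update the timestamp again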

Code excerpted from 《Python 3网络爬虫开发实战》.
