Goal: use requests to send a request to the Baidu homepage and fetch its data.
import requests
response = requests.get("https://www.baidu.com/")
Commonly used attributes of response (see the requests docs for detailed usage):
response.text (str, decoded with the encoding requests guesses)
response.content (bytes; recommended)
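For instance, a minimal sketch of the difference (the exact output depends on what Baidu serves):
import requests

response = requests.get("https://www.baidu.com/")
print(type(response.text))     # <class 'str'>  - already decoded, using a guessed encoding
print(type(response.content))  # <class 'bytes'> - raw bytes, decode them yourself
print(response.content.decode()[:60])  # first 60 characters of the page source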
A general way to get a page's source:
response.content.decode()
response.content.decode("gbk")
response.text
Try these three methods in order; between them they resolve virtually all page-encoding problems.
Recommended: use response.content.decode() to obtain the HTML page.
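That fallback order can be wrapped in a small helper (a sketch; the name try_decode is mine):
import requests

def try_decode(response):
    # Try explicit encodings first, then fall back to response.text,
    # which uses the encoding requests guesses from the headers/body.
    for encoding in ("utf-8", "gbk"):
        try:
            return response.content.decode(encoding)
        except UnicodeDecodeError:
            continue
    return response.text

html = try_decode(requests.get("https://www.baidu.com/"))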
Example: downloading an image; binary content is written without decoding:
import requests
url = "https://www.baidu.com/img/bd_logo1.png?where=super"
response = requests.get(url)
with open("baidu.png", "wb") as f:  # "w" writes str; "wb" writes bytes
    f.write(response.content)  # write the raw bytes directly; no decoding needed
Compare the Baidu homepage source shown in a browser with the source fetched by the code above: what is different?
The source fetched by the code is much shorter. Why?
Mimic a browser to fool the server into returning the same content a browser gets. By default, requests announces itself with headers like:
headers = {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
Replace the User-Agent with a real browser string and pass it in:
requests.get(url, headers=headers)
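A quick sketch of the effect (the byte counts will vary):
import requests

url = "https://www.baidu.com/"
plain = requests.get(url)  # sent with the default python-requests User-Agent
browser_ua = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
disguised = requests.get(url, headers=browser_ua)
print(len(plain.content), len(disguised.content))  # the disguised response is far larger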
Sending a request with URL parameters.
Wrong example 1 (no query string to work with): https://mp.csdn.net/mdeditor/93709582
Correct example 2 (parameters follow the ?): http://www.baidu.com/s?wd=python&c=b
kw = {"wd": "长城"}
requests.get(url, params=kw)
Many parameters in a URL are useless. In Baidu's search URL, for example, only one field (wd) matters; everything else can be deleted.
Accordingly, whenever a crawler meets a URL with many parameters, try stripping them, as in the sketch below.
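A minimal sketch of that idea (the extra rsv_* parameters here are made up for illustration; always verify that the stripped URL still returns the same page):
import requests
from urllib.parse import urlsplit, parse_qs

full_url = "https://www.baidu.com/s?wd=python&rsv_spt=1&rsv_iqid=0x8&issp=1"
query = parse_qs(urlsplit(full_url).query)  # {'wd': ['python'], 'rsv_spt': ['1'], ...}
slim = requests.get("https://www.baidu.com/s",
                    params={"wd": query["wd"][0]},
                    headers={"User-Agent": "Mozilla/5.0"})
print(slim.request.url)  # https://www.baidu.com/s?wd=python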
Method 1: pass the query via the params argument:
import requests
query_string = input("Search for: ")
params = {"wd": query_string}
url = "https://www.baidu.com/s?"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
response = requests.get(url, params=params, headers=headers)
print(response.request.url)  # e.g. https://www.baidu.com/s?wd=python (non-ASCII input is URL-encoded)
print(response.content.decode())
print(response.status_code)
Method 2: build the URL yourself with % string formatting:
import requests
query_string = input("Search for: ")
# params = {"wd": query_string}
url = "https://www.baidu.com/s?wd=%s" % query_string
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
response = requests.get(url, headers=headers)
print(response.request.url)
print(response.content.decode())
print(response.status_code)
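Either way, non-ASCII queries end up percent-encoded in the final URL; urllib from the standard library shows the encoding explicitly:
from urllib.parse import quote
print(quote("长城"))  # %E9%95%BF%E5%9F%8E - what the request URL actually carries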
Usage of the format method:
a = "你好{}".format(",哈哈")
print(a) # 你好,哈哈
b = "{}你好{}".format([1, 2, 3], ",哈哈")
print(b) # [1, 2, 3]你好,哈哈
Method 3: build the URL with str.format:
import requests
url = "https://www.baidu.com/s?wd={}".format("python")
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
response = requests.get(url, headers=headers)
print(response.request.url)
print(response.content.decode())
print(response.status_code)
Good code (each step lives in its own method):
import requests

class Spider:
    def __init__(self, name):
        self.name = name
        self.url_temp = "https://tieba.baidu.com/f?kw=" + name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    def get_url_list(self):
        # 1. build the list of page URLs (Tieba pages step by 50)
        return [self.url_temp.format(i * 50) for i in range(1000)]

    def parse_url(self, url):
        # 2. send the request, get the response
        response = requests.get(url, headers=self.headers)
        return response

    def save_html_str(self, html_str, page_num):
        # 3. save the page
        file_path = "D:/ddd/{}_page_{}.html".format(self.name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        url_list = self.get_url_list()              # 1. build the URL list
        for url in url_list:                        # 2. request each page
            html_str = self.parse_url(url).content.decode()
            page_num = url_list.index(url) + 1
            self.save_html_str(html_str, page_num)  # 3. save it

def main():
    name = input("Enter the Tieba forum to crawl: ")
    tieba_spider = Spider(name)
    tieba_spider.run()

if __name__ == "__main__":
    main()
My own rough version: everything is crammed into one method, so URL construction, requesting, and saving cannot be changed or reused independently.
import requests

class Spider:
    def __init__(self, name):
        self.name = name
        url_temp = "https://tieba.baidu.com/f?kw=" + self.name + "&ie=utf-8&pn={}"
        self.url_list = [url_temp.format(i * 50) for i in range(1000)]
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    def work(self):
        # build, request and save, all in one method
        for url in self.url_list:
            response = requests.get(url, headers=self.headers)
            file_path = "D:/ccc/{}_page_{}.html".format(self.name, self.url_list.index(url) + 1)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(response.content.decode())

def main():
    name = input("Enter the Tieba forum to crawl: ")
    a = Spider(name)
    a.work()

if __name__ == "__main__":
    main()