赞
踩
目录
简单地用 Python 的一些库(requests、re、time)爬取豆瓣读书 Top 250 的书籍。
- import requests
- import re
- import time
- ''' 获取一页的数据 '''
def get_one_page(url):
    """Download one page and return its HTML as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded with the encoding that requests detects
        from the content itself (``apparent_encoding``).

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    # requests has no default timeout; without one a stalled connection
    # would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of handing an error page to
    # the parser as if it were real listing HTML.
    response.raise_for_status()
    # Decode using the encoding detected from the body.
    response.encoding = response.apparent_encoding
    return response.text
- ''' 获取书籍 '''
def _first_group(pattern, text):
    """Return group 1 of the first match of pattern in text, or None."""
    match = re.search(pattern, text, re.S)
    return match.group(1) if match is not None else None


def get_books(html, book_list):
    """Parse listing-page HTML and append one dict per book to book_list.

    Each ``<tr class="item">`` row yields a dict with the keys
    "书名" (title), "作者" (author), "评分" (rating) and "简介" (blurb).

    Args:
        html: HTML text of a listing page.
        book_list: List that is extended in place with the parsed books.

    Returns:
        None. Results are delivered through ``book_list``.

    Rows without a title are skipped. A missing author or rating becomes an
    empty string, and a missing blurb becomes "暂无简介", so one malformed
    row no longer aborts the whole page with an AttributeError.
    """
    for row in re.findall(r'<tr class="item">(.*?)</tr>', html, re.S):
        title = _first_group(r'title="(.*?)"', row)
        if title is None:
            # A row with no title cannot be identified as a book.
            continue

        # The author is the text before the first " /" in the publication line.
        author = _first_group(r'<p class="pl">(.*?) /', row)
        rating = _first_group(r'class="rating_nums">(.*?)</span>', row)
        # Not every book has a one-line blurb.
        blurb = _first_group(r'<span class="inq">(.*?)</span>', row)

        book_list.append({
            "书名": title,
            "作者": author if author is not None else "",
            "评分": rating if rating is not None else "",
            "简介": blurb if blurb is not None else "暂无简介",
        })
# Entry URL of the listing.
base_url = "https://book.douban.com/top250"
# Collects one dict per parsed book across all pages.
book_list = []
# Pages are addressed by the ?start= offset: 0, 25, ..., 225.
for page_start in range(0, 250, 25):
    url = f"{base_url}?start={page_start}"
    print(f"正在爬取:{url}")
    get_books(get_one_page(url), book_list)
    # Pause two seconds between page requests.
    time.sleep(2)

for entry in book_list:
    print(entry)
- import requests
- import re
- import time
-
- ''' 获取一页的数据 '''
def get_one_page(url):
    """Download one page and return its HTML as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded with the encoding that requests detects
        from the content itself (``apparent_encoding``).

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    # requests has no default timeout; without one a stalled connection
    # would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an error status instead of handing an error page to
    # the parser as if it were real listing HTML.
    response.raise_for_status()
    # Decode using the encoding detected from the body.
    response.encoding = response.apparent_encoding
    return response.text
-
- ''' 获取书籍 '''
def _first_group(pattern, text):
    """Return group 1 of the first match of pattern in text, or None."""
    match = re.search(pattern, text, re.S)
    return match.group(1) if match is not None else None


def get_books(html, book_list):
    """Parse listing-page HTML and append one dict per book to book_list.

    Each ``<tr class="item">`` row yields a dict with the keys
    "书名" (title), "作者" (author), "评分" (rating) and "简介" (blurb).

    Args:
        html: HTML text of a listing page.
        book_list: List that is extended in place with the parsed books.

    Returns:
        None. Results are delivered through ``book_list``.

    Rows without a title are skipped. A missing author or rating becomes an
    empty string, and a missing blurb becomes "暂无简介", so one malformed
    row no longer aborts the whole page with an AttributeError.
    """
    for row in re.findall(r'<tr class="item">(.*?)</tr>', html, re.S):
        title = _first_group(r'title="(.*?)"', row)
        if title is None:
            # A row with no title cannot be identified as a book.
            continue

        # The author is the text before the first " /" in the publication line.
        author = _first_group(r'<p class="pl">(.*?) /', row)
        rating = _first_group(r'class="rating_nums">(.*?)</span>', row)
        # Not every book has a one-line blurb.
        blurb = _first_group(r'<span class="inq">(.*?)</span>', row)

        book_list.append({
            "书名": title,
            "作者": author if author is not None else "",
            "评分": rating if rating is not None else "",
            "简介": blurb if blurb is not None else "暂无简介",
        })
-
# Entry URL of the listing.
base_url = "https://book.douban.com/top250"
# Collects one dict per parsed book across all pages.
book_list = []
# Pages are addressed by the ?start= offset: 0, 25, ..., 225.
for page_start in range(0, 250, 25):
    url = f"{base_url}?start={page_start}"
    print(f"正在爬取:{url}")
    get_books(get_one_page(url), book_list)
    # Pause two seconds between page requests.
    time.sleep(2)

for entry in book_list:
    print(entry)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。