Note: a random delay is only applied to the tag listing-page requests; the per-book requests that fetch each book's rating count are sent without any delay. Don't crawl too fast, or your IP may get banned.
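If you want to slow down the per-book requests as well, a small helper can be called at the top of the get_people_num() function shown in the scripts below. This is only a sketch; the 1 to 3 second range is my own choice, not something from the original code:

```python
import random
import time


def polite_delay(min_s=1.0, max_s=3.0):
    """Sleep for a random interval (hypothetical helper; the 1-3 s range is an assumption)."""
    time.sleep(random.uniform(min_s, max_s))
```

Calling polite_delay() as the first line of get_people_num() spreads the per-book requests out the same way the listing-page requests already are.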
Version 1: the spider exports the results to an Excel workbook, one sheet per tag.

```python
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File        : douban_spider.py
@Contact     : raogx.vip@hotmail.com
@License     : (C)Copyright 2019-2020, Liugroup-NLPR-CASIA

@Modify Time        @Author    @Version    @Description
----------------    -------    --------    ------------
2020/4/2 16:00      ligang     1.0         None
'''
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Some User Agents: Chrome / Edge / IE
User_Agents = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
]


def book_spider(book_tag):
    """
    Crawl the book listings for one tag.
    :param book_tag: book tag (category)
    :return: list of book records
    """
    book_list = list()
    try_times = 0
    # Pages 0 through 4, 15 books per page
    for page_num in range(0, 5):
        url = 'https://www.douban.com/tag/' + \
            urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        # Random delay before each listing-page request
        time.sleep(np.random.rand() * 5)
        try:
            source_code = requests.get(
                url,
                headers=User_Agents[page_num % len(User_Agents)],
                timeout=50).text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired,
                requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            continue

        soup = BeautifulSoup(plain_text, 'lxml')  # the lxml parser is required
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})

        try_times += 1
        if list_soup is None and try_times < 200:
            continue
        elif list_soup is None or len(list_soup) <= 1:
            break

        # Walk through every book on the current page
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', attrs={'class': 'title'}).string.strip()
            desc = book_info.find('div', attrs={'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            # URL of the book's detail page
            book_url = book_info.find('a', attrs={'class': 'title'}).get('href')

            # Author(s)/translator(s); ' 暂无' means "not available"
            try:
                author_info = '/'.join(desc_list[0:-3])
            except BaseException:
                author_info = ' 暂无'
            # Publisher
            try:
                pub_info = '/'.join(desc_list[-3:-2])
            except BaseException:
                pub_info = ' 暂无'
            # Publication date
            try:
                pub_date = desc_list[-2:-1][0]
            except BaseException:
                pub_date = ' 暂无'
            # Price
            try:
                price = desc_list[-1]
            except BaseException:
                price = ' 暂无'
            # Rating
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except BaseException:
                rating = '0.0'
            # Rating count, fetched from the book's detail page
            try:
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人评价')
            except BaseException:
                people_num = '0.0'

            book_list.append([title, rating, people_num, author_info,
                              pub_info, pub_date, price])
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    return book_list


# Visit each book's detail page to get its rating count
def get_people_num(url):
    """
    Crawl the number of ratings for one book.
    :param url: URL of the book's detail page
    :return: rating count
    """
    source_code = requests.get(
        url,
        headers=User_Agents[np.random.randint(0, len(User_Agents))],
        timeout=50).text
    plain_text = str(source_code)
    soup = BeautifulSoup(plain_text, 'lxml')
    people_num = soup.find('div', {'class': 'rating_sum'}).findAll('span')[1].string.strip()
    return people_num


def fetch_list(book_tag, book_dicts):
    """
    Crawl one tag and sort its books by rating.
    :param book_tag: book tag
    :param book_dicts: dict that collects the results per tag
    :return:
    """
    book_list = book_spider(book_tag)
    # Sort by rating, highest first (compare as numbers, not strings)
    book_list = sorted(book_list, key=lambda x: float(x[1]), reverse=True)
    book_dicts[book_tag] = book_list


def run_spider(book_tag_lists):
    """
    Create one thread per tag and crawl all tags concurrently.
    :param book_tag_lists: all book tags to crawl
    :return: all collected book details
    """
    # All book information keyed by tag, e.g. {'文化': [...], '算法': [...]}
    book_dicts = dict()
    # One worker thread per tag
    with ThreadPoolExecutor(max_workers=len(book_tag_lists)) as executor:
        for book_tag in book_tag_lists:
            executor.submit(fetch_list, book_tag, book_dicts)
    return book_dicts


def output_to_excel(book_dicts, book_tag_lists):
    """
    Write all book details to an Excel file, one sheet per tag.
    :param book_dicts: crawled book information
    :param book_tag_lists: list of book tags
    :return:
    """
    wb = Workbook(write_only=True)
    for book_tag in book_tag_lists:
        ws = wb.create_sheet(title=book_tag)
        ws.append(['序号', '书名', '评分', '评论人数', '作者/译者', '出版社', '出版日期', '售价'])
        for index, book_list in enumerate(book_dicts[book_tag], start=1):
            ws.append([index, book_list[0], book_list[1], book_list[2],
                       book_list[3], book_list[4], book_list[5], book_list[6]])

    # Build the output file name from the tag names
    file_name = 'Book-List'
    for i in range(len(book_tag_lists)):
        file_name += ('-' + book_tag_lists[i])
    file_name += '.xlsx'
    wb.save(file_name)


if __name__ == '__main__':
    # Add the book tags to crawl here
    book_tag_lists = ['Python', '算法']
    book_dicts = run_spider(book_tag_lists)
    output_to_excel(book_dicts, book_tag_lists)
    print("----All Done----")
```
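To spot-check the Excel output, the workbook can be read back with openpyxl. A minimal sketch, assuming the default tags 'Python' and '算法' were used, so the script saved the file as Book-List-Python-算法.xlsx:

```python
from openpyxl import load_workbook

# Open the workbook written by output_to_excel(); the file name follows the
# 'Book-List-<tag>-<tag>.xlsx' pattern that the script builds.
wb = load_workbook('Book-List-Python-算法.xlsx', read_only=True)
for sheet_name in wb.sheetnames:
    ws = wb[sheet_name]
    print('---', sheet_name, '---')
    # Skip the header row and print index, title, rating and rating count
    for row in ws.iter_rows(min_row=2, values_only=True):
        print(row[0], row[1], row[2], row[3])
```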
Version 2: the same spider, but every record is written into a MySQL table as soon as it is scraped, instead of being exported to Excel.

```python
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File        : douban_spider.py
@Contact     : raogx.vip@hotmail.com
@License     : (C)Copyright 2019-2020, Liugroup-NLPR-CASIA

@Modify Time        @Author    @Version    @Description
----------------    -------    --------    ------------
2020/4/2 16:00      ligang     1.0         None
'''
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pymysql
import requests
from bs4 import BeautifulSoup

# Some User Agents: Chrome / Edge / IE
User_Agents = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
]


def book_spider(book_tag):
    """
    Crawl the book listings for one tag and store each record in MySQL.
    :param book_tag: book tag (category)
    :return: list of book records (left empty; rows go straight to the database)
    """
    book_list = list()
    try_times = 0
    # Pages 0 through 4, 15 books per page
    for page_num in range(0, 5):
        url = 'https://www.douban.com/tag/' + \
            urllib.request.quote(book_tag) + '/book?start=' + str(page_num * 15)
        # Random delay before each listing-page request
        time.sleep(np.random.rand() * 5)
        try:
            source_code = requests.get(
                url,
                headers=User_Agents[page_num % len(User_Agents)],
                timeout=50).text
            plain_text = str(source_code)
        except (requests.HTTPError, requests.URLRequired,
                requests.Timeout, requests.TooManyRedirects) as error:
            print(error)
            continue

        soup = BeautifulSoup(plain_text, 'lxml')  # the lxml parser is required
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})

        try_times += 1
        if list_soup is None and try_times < 200:
            continue
        elif list_soup is None or len(list_soup) <= 1:
            break

        # Walk through every book on the current page
        for book_info in list_soup.findAll('dd'):
            title = book_info.find('a', attrs={'class': 'title'}).string.strip()
            desc = book_info.find('div', attrs={'class': 'desc'}).string.strip()
            desc_list = desc.split('/')
            # URL of the book's detail page
            book_url = book_info.find('a', attrs={'class': 'title'}).get('href')

            # Author(s)/translator(s); ' 暂无' means "not available"
            try:
                author_info = '/'.join(desc_list[0:-3])
            except BaseException:
                author_info = ' 暂无'
            # Publisher
            try:
                pub_info = '/'.join(desc_list[-3:-2])
            except BaseException:
                pub_info = ' 暂无'
            # Publication date
            try:
                pub_date = desc_list[-2:-1][0]
            except BaseException:
                pub_date = ' 暂无'
            # Price
            try:
                price = desc_list[-1]
            except BaseException:
                price = ' 暂无'
            # Rating
            try:
                rating = book_info.find('span', {'class': 'rating_nums'}).string.strip()
            except BaseException:
                rating = '0.0'
            # Rating count, fetched from the book's detail page
            try:
                people_num = get_people_num(book_url)
                people_num = people_num.strip('人评价')
            except BaseException:
                people_num = '0.0'

            # Print the record and insert it into the database
            print((title, rating, people_num, author_info,
                   pub_info, pub_date, price, book_tag))
            insert_data((title, rating, people_num, author_info,
                         pub_info, pub_date, price, book_tag))
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    return book_list


# Visit each book's detail page to get its rating count
def get_people_num(url):
    """
    Crawl the number of ratings for one book.
    :param url: URL of the book's detail page
    :return: rating count
    """
    source_code = requests.get(
        url,
        headers=User_Agents[np.random.randint(0, len(User_Agents))],
        timeout=50).text
    plain_text = str(source_code)
    soup = BeautifulSoup(plain_text, 'lxml')
    people_num = soup.find('div', {'class': 'rating_sum'}).findAll('span')[1].string.strip()
    return people_num


def fetch_list(book_tag, book_dicts):
    """
    Crawl one tag and sort its books by rating.
    :param book_tag: book tag
    :param book_dicts: dict that collects the results per tag
    :return:
    """
    book_list = book_spider(book_tag)
    # Sort by rating, highest first (compare as numbers, not strings)
    book_list = sorted(book_list, key=lambda x: float(x[1]), reverse=True)
    book_dicts[book_tag] = book_list


def run_spider(book_tag_lists):
    """
    Create one thread per tag and crawl all tags concurrently.
    :param book_tag_lists: all book tags to crawl
    :return: all collected book details
    """
    # All book information keyed by tag, e.g. {'文化': [...], '算法': [...]}
    book_dicts = dict()
    # One worker thread per tag
    with ThreadPoolExecutor(max_workers=len(book_tag_lists)) as executor:
        for book_tag in book_tag_lists:
            executor.submit(fetch_list, book_tag, book_dicts)
    return book_dicts


def connect_mysql():
    global conn
    # Open the database connection: host, user, password, database name.
    # Note: this single connection is shared by all worker threads; pymysql
    # connections are not thread-safe, so with many tags a per-thread
    # connection (or a lock around insert_data) would be safer.
    conn = pymysql.connect(host='localhost', user='root',
                           password='123456', database='shuping')


def insert_data(data=()):
    # Create a cursor object
    cursor = conn.cursor()
    # SQL statement: insert one record into the table
    sql = """insert into doubanbook(title, rating, people_num, author_info,
                                    pub_info, pub_date, price, booktype)
             values(%s, %s, %s, %s, %s, %s, %s, %s)"""
    try:
        # Execute the statement and commit the transaction
        cursor.execute(sql, data)
        conn.commit()
    except Exception:
        # Roll back if the insert fails
        conn.rollback()


def close_mysql():
    # Close the database connection
    conn.close()


if __name__ == '__main__':
    connect_mysql()
    # Add the book tags to crawl here
    book_tag_lists = ['Python', '算法']
    book_dicts = run_spider(book_tag_lists)
    close_mysql()
    print("----All Done----")
```
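insert_data() assumes that a doubanbook table already exists in the shuping database. A possible one-off setup script matching the eight columns of the INSERT statement; the column types and lengths are my assumption, only the column names come from the spider:

```python
import pymysql

# Hypothetical setup: create the table that insert_data() writes to.
conn = pymysql.connect(host='localhost', user='root',
                       password='123456', database='shuping')
create_sql = """
CREATE TABLE IF NOT EXISTS doubanbook (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    title       VARCHAR(255),
    rating      VARCHAR(16),
    people_num  VARCHAR(32),
    author_info VARCHAR(255),
    pub_info    VARCHAR(255),
    pub_date    VARCHAR(64),
    price       VARCHAR(64),
    booktype    VARCHAR(64)
) DEFAULT CHARSET = utf8mb4
"""
with conn.cursor() as cursor:
    cursor.execute(create_sql)
conn.commit()
conn.close()
```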
Result: (output screenshot from the original post omitted)