当前位置:   article > 正文

豆瓣书籍数据爬取

豆瓣书籍数据爬取

爬取书名、评分、评论人数、作者/译者、出版社、出版日期、售价,并保存到 Excel 和 MySQL 数据库。

技术:运用多线程(每一个书籍类型创建一个线程)加速处理

注意:爬取时,只在请求每一页书籍列表之前进行了随机延时(0~5 秒);在爬取具体书籍的评论人数时,没有进行延时处理。别爬太快,防止 IP 被封。

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File    :   douban_spider.py
@Contact :   raogx.vip@hotmail.com
@License :   (C)Copyright 2019-2020, Liugroup-NLPR-CASIA

@Modify Time         @Author    @Version    @Description
------------         -------    --------    -----------
2020/4/2  16:00       ligang      1.0         None

加 V 交流:15188607997
'''

import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy
import numpy as np
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Pool of request-header dicts (Chrome / Edge / IE user agents); the crawler
# indexes into this list to vary the User-Agent between requests.
User_Agents = [
    {'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    )
]

def _parse_book_desc(desc):
    """Split a listing description into author, publisher, date and price.

    The description is split on '/' and read from the right: the last three
    fields are taken as publisher, publication date and price, and everything
    before them is re-joined as the author/translator text. Any field that is
    absent or blank is replaced with the ' 暂无' placeholder.

    :param desc: description text of one book entry (may be empty)
    :return: tuple ``(author_info, pub_info, pub_date, price)``
    """
    missing = ' 暂无'
    fields = desc.split('/') if desc else []

    def pick(offset_from_end):
        # Return the field counted from the right, or the placeholder.
        if len(fields) < offset_from_end:
            return missing
        value = fields[-offset_from_end]
        return value if value.strip() else missing

    author_info = '/'.join(fields[:-3])
    if not author_info.strip():
        author_info = missing
    return author_info, pick(3), pick(2), pick(1)


def book_spider(book_tag, max_pages=5):
    """
    Crawl the Douban tag listing pages for one book category.

    Each listing page is requested with ``start = page_num * 15`` after a
    random pause of up to 5 seconds. For every ``<dd>`` entry found, the title,
    rating and description fields are extracted and the book's own page is
    visited through ``get_people_num`` to read the number of raters.

    :param book_tag: book category (tag) to crawl
    :param max_pages: number of listing pages to request (default 5)
    :return: list of ``[title, rating, people_num, author_info, pub_info,
             pub_date, price]`` rows
    """
    book_list = []
    for page_num in range(max_pages):
        url = ('https://www.douban.com/tag/' + urllib.request.quote(book_tag)
               + '/book?start=' + str(page_num * 15))
        time.sleep(numpy.random.rand() * 5)
        try:
            response = requests.get(
                url, headers=User_Agents[page_num % len(User_Agents)], timeout=50)
            response.raise_for_status()
        except requests.RequestException as error:
            # RequestException also covers connection failures, which the
            # previous exception tuple let escape and kill the worker thread.
            print(error)
            continue

        soup = BeautifulSoup(response.text, 'lxml')  # lxml module is required.
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        if list_soup is None:
            # No listing container on this page: skip it, try the next page.
            continue
        if len(list_soup) <= 1:
            # A container with at most one child node is treated as the end
            # of the results.
            break

        # Walk every book entry on the current page.
        for book_info in list_soup.find_all('dd'):
            title_link = book_info.find('a', attrs={'class': 'title'})
            if title_link is None:
                # Without a title link there is neither a name nor a URL.
                continue
            title = title_link.get_text(strip=True)
            book_url = title_link.get('href')

            desc_tag = book_info.find('div', attrs={'class': 'desc'})
            desc = desc_tag.get_text(strip=True) if desc_tag is not None else ''
            author_info, pub_info, pub_date, price = _parse_book_desc(desc)

            rating_tag = book_info.find('span', {'class': 'rating_nums'})
            rating = rating_tag.get_text(strip=True) if rating_tag is not None else ''
            if not rating:
                rating = '0.0'

            # The rater count lives on the book's own page. This lookup is
            # deliberately best effort: any failure falls back to a default.
            try:
                people_num = get_people_num(book_url).strip('人评价')
            except Exception:
                people_num = '0.0'

            book_list.append([title, rating, people_num, author_info, pub_info, pub_date, price])
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    return book_list

# Visit a single book's page to read details not present on the listing page.
def get_people_num(url):
    """
    Fetch one book page and return the rating-count text.

    The page is requested with a randomly chosen entry of ``User_Agents``;
    the result is the stripped text of the second ``<span>`` found inside the
    ``<div class="rating_sum">`` element.

    :param url: URL of one specific book page
    :return: stripped text of that span
    """
    agent_index = np.random.randint(0, len(User_Agents))
    response = requests.get(url, headers=User_Agents[agent_index], timeout=50)
    page = BeautifulSoup(str(response.text), 'lxml')
    rating_box = page.find('div', {'class': 'rating_sum'})
    spans = rating_box.findAll('span')
    return spans[1].string.strip()


def fetch_list(book_tag, book_dicts):
    """
    Crawl one book category and store its rows sorted by rating.

    Ratings are strings, so they are converted to numbers for sorting;
    comparing the raw strings would rank '10.0' below '9.5'.

    :param book_tag: book category (tag) to crawl
    :param book_dicts: dict that receives ``book_tag -> sorted rows``
    :return: None (the result is written into ``book_dicts``)
    """
    def rating_key(book):
        # Unparsable ratings sort as 0.0 instead of aborting the whole sort.
        try:
            return float(book[1])
        except (TypeError, ValueError):
            return 0.0

    # Highest rating first.
    book_dicts[book_tag] = sorted(book_spider(book_tag), key=rating_key, reverse=True)


def run_spider(book_tag_lists):
    """
    Crawl every requested category, one worker thread per category.

    :param book_tag_lists: all book categories (tags) to crawl
    :return: dict mapping each tag to its list of rows, e.g.
             ``{'文化': [...], '算法': [...]}``; a tag whose worker failed
             maps to an empty list
    """
    book_dicts = dict()
    if not book_tag_lists:
        # ThreadPoolExecutor rejects max_workers=0, so return early.
        return book_dicts

    # One thread per category; leaving the ``with`` block waits for them all.
    with ThreadPoolExecutor(max_workers=len(book_tag_lists)) as executor:
        futures = {
            executor.submit(fetch_list, book_tag, book_dicts): book_tag
            for book_tag in book_tag_lists
        }

    # A worker's exception is stored on its future; report it instead of
    # dropping it, and keep the tag present so later lookups do not fail.
    for future, book_tag in futures.items():
        error = future.exception()
        if error is not None:
            print('Crawling tag {} failed: {!r}'.format(book_tag, error))
            book_dicts.setdefault(book_tag, [])
    return book_dicts


def output_to_excel(book_dicts, book_tag_lists):
    """
    Write all crawled rows to an Excel workbook, one sheet per category.

    The workbook is saved as ``Book-List-<tag1>-<tag2>....xlsx`` in the current
    working directory.

    :param book_dicts: crawled rows keyed by category
    :param book_tag_lists: list of categories; determines sheets and file name
    :return: None
    """
    # NOTE(review): the header lists nine columns but each row carries only
    # eight values, so the final '评论' column is always empty.
    header = ['序号', '书名', '评分', '评论人数', '作者/译者', '出版社', '出版日期', '售价', '评论']

    wb = Workbook(write_only=True)
    for book_tag in book_tag_lists:
        ws = wb.create_sheet(title=book_tag)
        ws.append(header)
        # A tag may be missing if its crawl failed; write a header-only sheet.
        for index, book in enumerate(book_dicts.get(book_tag, []), start=1):
            ws.append([index] + list(book[:7]))

    # Build the output file name from all tags.
    file_name = '-'.join(['Book-List'] + list(book_tag_lists)) + '.xlsx'
    wb.save(file_name)


if __name__ == '__main__':
    # Add further book categories to crawl to this list.
    book_tag_lists = ['Python', '算法']
    # Crawl everything, then dump the result into the Excel workbook.
    output_to_excel(run_spider(book_tag_lists), book_tag_lists)
    print("----All Done----")



  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193

结果:

在这里插入图片描述

数据保存到 MySQL 数据库

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File    :   douban_spider.py
@Contact :   raogx.vip@hotmail.com
@License :   (C)Copyright 2019-2020, Liugroup-NLPR-CASIA

@Modify Time         @Author    @Version    @Description
------------         -------    --------    -----------
2020/4/2  16:00       ligang      1.0         None

加 V 交流:15188607997
'''

import threading
import time
import urllib.request
from concurrent.futures import ThreadPoolExecutor

import numpy
import numpy as np
import pymysql
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Pool of request-header dicts (Chrome / Edge / IE user agents); the crawler
# indexes into this list to vary the User-Agent between requests.
User_Agents = [
    {'User-Agent': agent}
    for agent in (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    )
]

def _parse_book_desc(desc):
    """Split a listing description into author, publisher, date and price.

    The description is split on '/' and read from the right: the last three
    fields are taken as publisher, publication date and price, and everything
    before them is re-joined as the author/translator text. Any field that is
    absent or blank is replaced with the ' 暂无' placeholder.

    :param desc: description text of one book entry (may be empty)
    :return: tuple ``(author_info, pub_info, pub_date, price)``
    """
    missing = ' 暂无'
    fields = desc.split('/') if desc else []

    def pick(offset_from_end):
        # Return the field counted from the right, or the placeholder.
        if len(fields) < offset_from_end:
            return missing
        value = fields[-offset_from_end]
        return value if value.strip() else missing

    author_info = '/'.join(fields[:-3])
    if not author_info.strip():
        author_info = missing
    return author_info, pick(3), pick(2), pick(1)


def book_spider(book_tag, max_pages=5):
    """
    Crawl the Douban tag listing pages for one category and store each book.

    Each listing page is requested with ``start = page_num * 15`` after a
    random pause of up to 5 seconds. For every ``<dd>`` entry found, the title,
    rating and description fields are extracted, the book's own page is visited
    through ``get_people_num`` to read the number of raters, and the record is
    printed and written to the database with ``insert_data``.

    :param book_tag: book category (tag) to crawl
    :param max_pages: number of listing pages to request (default 5)
    :return: list of ``[title, rating, people_num, author_info, pub_info,
             pub_date, price]`` rows
    """
    book_list = []
    for page_num in range(max_pages):
        url = ('https://www.douban.com/tag/' + urllib.request.quote(book_tag)
               + '/book?start=' + str(page_num * 15))
        time.sleep(numpy.random.rand() * 5)
        try:
            response = requests.get(
                url, headers=User_Agents[page_num % len(User_Agents)], timeout=50)
            response.raise_for_status()
        except requests.RequestException as error:
            # RequestException also covers connection failures, which the
            # previous exception tuple let escape and kill the worker thread.
            print(error)
            continue

        soup = BeautifulSoup(response.text, 'lxml')  # lxml module is required.
        list_soup = soup.find('div', attrs={'class': 'mod book-list'})
        if list_soup is None:
            # No listing container on this page: skip it, try the next page.
            continue
        if len(list_soup) <= 1:
            # A container with at most one child node is treated as the end
            # of the results.
            break

        # Walk every book entry on the current page.
        for book_info in list_soup.find_all('dd'):
            title_link = book_info.find('a', attrs={'class': 'title'})
            if title_link is None:
                # Without a title link there is neither a name nor a URL.
                continue
            title = title_link.get_text(strip=True)
            book_url = title_link.get('href')

            desc_tag = book_info.find('div', attrs={'class': 'desc'})
            desc = desc_tag.get_text(strip=True) if desc_tag is not None else ''
            author_info, pub_info, pub_date, price = _parse_book_desc(desc)

            rating_tag = book_info.find('span', {'class': 'rating_nums'})
            rating = rating_tag.get_text(strip=True) if rating_tag is not None else ''
            if not rating:
                rating = '0.0'

            # The rater count lives on the book's own page. This lookup is
            # deliberately best effort: any failure falls back to a default.
            try:
                people_num = get_people_num(book_url).strip('人评价')
            except Exception:
                people_num = '0.0'

            record = (title, rating, people_num, author_info, pub_info, pub_date, price, book_tag)
            print(record)
            insert_data(record)
            # Also collect the row so the return value matches the documented
            # contract (previously the list was never filled).
            book_list.append([title, rating, people_num, author_info, pub_info, pub_date, price])
        print("Downloading Information From Tag: {1} Page: {0} ".format(page_num, book_tag))
    return book_list

# Visit a single book's page to read details not present on the listing page.
def get_people_num(url):
    """
    Fetch one book page and return the rating-count text.

    The page is requested with a randomly chosen entry of ``User_Agents``;
    the result is the stripped text of the second ``<span>`` found inside the
    ``<div class="rating_sum">`` element.

    :param url: URL of one specific book page
    :return: stripped text of that span
    """
    agent_index = np.random.randint(0, len(User_Agents))
    response = requests.get(url, headers=User_Agents[agent_index], timeout=50)
    page = BeautifulSoup(str(response.text), 'lxml')
    rating_box = page.find('div', {'class': 'rating_sum'})
    spans = rating_box.findAll('span')
    return spans[1].string.strip()


def fetch_list(book_tag, book_dicts):
    """
    Crawl one book category and store its rows sorted by rating.

    Ratings are strings, so they are converted to numbers for sorting;
    comparing the raw strings would rank '10.0' below '9.5'.

    :param book_tag: book category (tag) to crawl
    :param book_dicts: dict that receives ``book_tag -> sorted rows``
    :return: None (the result is written into ``book_dicts``)
    """
    def rating_key(book):
        # Unparsable ratings sort as 0.0 instead of aborting the whole sort.
        try:
            return float(book[1])
        except (TypeError, ValueError):
            return 0.0

    # Highest rating first.
    book_dicts[book_tag] = sorted(book_spider(book_tag), key=rating_key, reverse=True)


def run_spider(book_tag_lists):
    """
    Crawl every requested category, one worker thread per category.

    :param book_tag_lists: all book categories (tags) to crawl
    :return: dict mapping each tag to its list of rows, e.g.
             ``{'文化': [...], '算法': [...]}``; a tag whose worker failed
             maps to an empty list
    """
    book_dicts = dict()
    if not book_tag_lists:
        # ThreadPoolExecutor rejects max_workers=0, so return early.
        return book_dicts

    # One thread per category; leaving the ``with`` block waits for them all.
    with ThreadPoolExecutor(max_workers=len(book_tag_lists)) as executor:
        futures = {
            executor.submit(fetch_list, book_tag, book_dicts): book_tag
            for book_tag in book_tag_lists
        }

    # A worker's exception is stored on its future; report it instead of
    # dropping it, and keep the tag present so later lookups do not fail.
    for future, book_tag in futures.items():
        error = future.exception()
        if error is not None:
            print('Crawling tag {} failed: {!r}'.format(book_tag, error))
            book_dicts.setdefault(book_tag, [])
    return book_dicts



def connect_mysql():
    """Open the MySQL connection and store it in the module-level ``conn``.

    Keyword arguments are used because PyMySQL 1.0 and later no longer accept
    positional connection parameters.
    """
    global conn
    # NOTE(review): host and credentials are hard-coded; consider reading them
    # from configuration or environment variables instead.
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='123456',
        database='shuping',
    )


# PyMySQL connections must not be used by several threads at the same time,
# yet every crawler thread writes through the single module-level ``conn``.
# This lock serialises those writes.
_INSERT_LOCK = threading.Lock()


def insert_data(data=''):
    """Insert one book record into the ``doubanbook`` table and commit.

    :param data: sequence of eight values matching the statement's
                 placeholders, in order: title, rating, people_num,
                 author_info, pub_info, pub_date, price, booktype. The ``''``
                 default is kept only for interface compatibility.
    :raises Exception: whatever the driver raises; the transaction is rolled
                       back before the error is re-raised
    """
    # Parameterised insert statement; the driver handles value escaping.
    sql = """insert into doubanbook(title, rating, people_num, author_info, pub_info, pub_date, price, booktype) value(%s, %s, %s, %s, %s, %s, %s, %s)"""

    with _INSERT_LOCK:
        try:
            # The context manager closes the cursor once the statement ran.
            with conn.cursor() as cursor:
                cursor.execute(sql, data)
            conn.commit()
        except Exception:
            # Undo the partial transaction, then let the caller see the error.
            conn.rollback()
            raise

def close_mysql():
    """Close the module-level database connection ``conn``."""
    # Close the database connection.
    conn.close()


if __name__ == '__main__':
    connect_mysql()
    try:
        # Add further book categories to crawl to this list.
        book_tag_lists = ['Python', '算法']
        book_dicts = run_spider(book_tag_lists)
    finally:
        # Release the database connection even if the crawl fails.
        close_mysql()
    print("----All Done----")

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171
  • 172
  • 173
  • 174
  • 175
  • 176
  • 177
  • 178
  • 179
  • 180
  • 181
  • 182
  • 183
  • 184
  • 185
  • 186
  • 187
  • 188
  • 189
  • 190
  • 191
  • 192
  • 193
  • 194
  • 195
  • 196
  • 197

结果:
在这里插入图片描述

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/凡人多烦事01/article/detail/401423
推荐阅读
相关标签
  

闽ICP备14008679号