
Python Web Scraping Example 1: Scraping a Web Novel (深空彼岸)




Preface

In the era of big data, the internet holds a vast amount of information. For an individual, some of this data is valuable and some is not. So how do we locate and quickly obtain the large amounts of data we need from this sea of information? Web scraping is an excellent way to acquire data. In this article, I take scraping a web novel as an example and share some experience with Python scrapers, along with problems I ran into along the way.

I. Required Libraries

requests: a third-party Python library for sending HTTP requests.
Install it with: pip install requests

The full scripts below also use lxml (XPath parsing), chardet (encoding detection), and xlwt/xlrd (writing and reading .xls files); install them the same way with pip.
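A minimal sketch of sending a GET request with a browser-like User-Agent (example.com is just a placeholder URL):

import requests

headers = {"User-Agent": "Mozilla/5.0"}  # a browser-like UA; some sites reject the library default
resp = requests.get("https://example.com", headers=headers, timeout=10)
print(resp.status_code)  # 200 on success
print(resp.encoding)     # encoding guessed from the response headers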

II. Page Analysis

1. Analyzing the chapter list page

Take scraping 《深空**》 from a certain site as the example. Target URL: www.****.in/book/12793/

Analyzing the chapter pagination and page source
Pick any chapter and inspect the page; you can find the links and titles of every chapter of the novel. From this we get the XPath of the chapter links: //div[@class="panel panel-default"]/dl/dd/a/@href (a bit lost? No worries: you can also copy a chapter's XPath straight from Chrome DevTools, as shown below.)

(Figure: copying an element's XPath in DevTools)

e.g. the copied XPath of one chapter is /html/body/div[2]/div[2]/dl/dd[1]/a
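To see this XPath in action on a self-contained snippet (the HTML below is a simplified stand-in for the real page):

from lxml import etree

snippet = '''
<div class="panel panel-default">
  <dl>
    <dd class="col-md-3"><a href="38415404.html" title="第一章 旧土">第一章 旧土</a></dd>
    <dd class="col-md-3"><a href="38415405.html" title="第二章">第二章</a></dd>
  </dl>
</div>'''
html = etree.HTML(snippet)
print(html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href'))
# ['38415404.html', '38415405.html']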

Code to collect the chapter links (excerpt from the full script in Section III):

url = "https://www.xbxwx.in/book/12793/"  # 网址
response = requests.get(url, headers=headers)
response.encoding = 'GB2312'
html = etree.HTML(response.text)
url_j_list = ['https://www.xbxwx.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
url_List.append(url_j_list)
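A side note: instead of concatenating strings, the standard library's urllib.parse.urljoin resolves relative hrefs against the catalog URL and is harder to get wrong; a minimal sketch:

from urllib.parse import urljoin

base = 'https://www.xbxwx.in/book/12793/'
hrefs = ['38415404.html', 'index_2.html']  # sample relative links as they appear in the page source
print([urljoin(base, h) for h in hrefs])
# ['https://www.xbxwx.in/book/12793/38415404.html', 'https://www.xbxwx.in/book/12793/index_2.html']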

2. Analyzing the chapter content page

Following the same procedure, the XPath for the chapter text is: //*[@id="htmlContent"]/text()

Code to scrape the chapter text (excerpt from the full script in Section III):

rep = requests.get(url_x, headers=headers)
encoding = chardet.detect(rep.content)['encoding']  # auto-detect the page encoding
rep.encoding = encoding
dom = etree.HTML(rep.text)
name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]  # chapter title
text = dom.xpath('//*[@id="htmlContent"]/text()')           # chapter body
with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
    for con in text:
        f.write(con)
    print(f'{name}_{i + 1} downloaded')
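Because the site's encoding is not consistent across pages, the snippet above asks chardet to guess the encoding from the raw bytes instead of hardcoding GB2312. A minimal sketch of what chardet.detect returns:

import chardet

raw = '深空彼岸的章节内容'.encode('gb2312')  # sample bytes in a legacy Chinese encoding
result = chardet.detect(raw)  # a dict: {'encoding': ..., 'confidence': ..., 'language': ...}
print(result)  # detection is statistical; very short inputs may be ambiguous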

III. Complete Code

1. Collecting the chapter URLs

import requests
from lxml import etree
import xlwt


path = 'D:\\Python_project\\xiaoshuo\\'  # output directory (a raw string cannot end with a backslash)
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}


def get_urls():
    # the chapter list is paginated on the site:
    '''
    chapters 1-60:   https://www.***.in/book/12793/
    chapters 61-120: https://www.***.in/book/12793/index_2.html
    page source:
    <select class="form-control" onchange="window.location=this.value;">
      <option value="/book/12793/">1</option>
      <option value="/book/12793/index_2.html">2</option>
      ......
      <option value="/book/12793/index_25.html" selected="">25 (last page)</option>
    </select>

    /html/body/div[2]/div[2]/div[2]/select/option[1]
    '''

    url = "https://www.***.in/book/12793/"  # 网址
    response = requests.get(url, headers=headers)
    response.encoding = 'GB2312'
    html = etree.HTML(response.text)

    url_N_id = ['https://www.***.in/' + x for x in html.xpath('//select[@class="form-control"]/option/@value')]
    number = len(url_N_id)
    url_List = []
    for j in range(number):
        # chapter URL list on the j-th index page
        '''
        To get a chapter's URL: inspect the page, find the chapter link element, and copy its XPath.
        <dd class="col-md-3">
         <a href="38415404.html" title="第一章 旧土">第一章 旧土</a>
        </dd>
        Right-click -> Copy XPath: xpath = /html/body/div[2]/div[2]/dl/dd[1]/a
        i.e. the xpath resolves to "38415404.html"
        '''
        if j != 0:
            url_j_id = url_N_id[j]
            response = requests.get(url_j_id, headers=headers)
            response.encoding = 'GB2312'
            html = etree.HTML(response.text)

        url_j_list = ['https://www.***.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
        url_List.append(url_j_list)

    # flatten the per-page lists into one chapter URL list
    url_list = []
    for page_urls in url_List:
        url_list.extend(page_urls)

    return url_list


def main():
    urls = get_urls()

    # create a workbook object
    workbook = xlwt.Workbook(encoding='GB2312')
    Sheet_name = workbook.add_sheet('深空彼岸小说章节网站')
    Headers = ['序号', '网址']
    for index, Header in enumerate(Headers):
        Sheet_name.write(0, index, Header)

    for index, url in enumerate(urls):
        Sheet_name.write(index + 1, 0, index + 1)
        Sheet_name.write(index + 1, 1, url)

    workbook.save('xiaoshuo.xls')


if __name__ == '__main__':
    main()


2. Scraping the chapter contents

import requests
from lxml import etree
import xlrd
import time
import random
import chardet  # automatic character-encoding detection


path = 'D:\\Python_project\\xiaoshuo\\'   # output directory for single-page chapters
path1 = r'D:\Python_project\xiaoshuo\xs'  # output directory for multi-page chapters
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}



def get_text(url):
    rep = requests.get(url, headers=headers)
    # the encoding varies between pages, so detect it instead of hardcoding 'GB2312'
    encoding = chardet.detect(rep.content)['encoding']
    rep.encoding = encoding
    dom = etree.HTML(rep.text)
    # some chapters are split across several pages, so the follow-up pages must be downloaded too
    # follow-up page URLs:
    '''
    e.g. page 1: https://www.***.in/book/12793/38415404.html
         page 2: https://www.***.in/book/12793/38415404_2.html
         page count element: //*[@id="content"]/div[1]/h1/small
    '''
    # number of pages in this chapter
    strnum = dom.xpath('//*[@id="content"]/div[1]/h1/small/text()')
    if len(strnum) == 0:
        # single-page chapter: the page has already been fetched and parsed above
        name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
        text = dom.xpath('//*[@id="htmlContent"]/text()')
        with open(path + f'{name}.txt', 'w', encoding='utf-8') as f:
            for con in text:
                f.write(con)
            print(f'{name} downloaded')
    else:
        # the 4th character of the page-count text (e.g. '(1/3)') is taken as the
        # total page count; this assumes the count is a single digit
        str1 = strnum[0][3:4]
        num = int(str1)
        for i in range(num):
            if i == 0:
                url_x = url
            else:
                url_x = url[:-5] + '_' + str(i + 1) + '.html'
            rep = requests.get(url_x, headers=headers)
            encoding = chardet.detect(rep.content)['encoding']
            rep.encoding = encoding
            dom = etree.HTML(rep.text)
            name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
            text = dom.xpath('//*[@id="htmlContent"]/text()')
            with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
                for con in text:
                    f.write(con)
                print(f'{name}_{i + 1} downloaded')


def main():

    # read the chapter URLs stored in the xls file
    workbook = xlrd.open_workbook('xiaoshuo.xls')
    Sheet_name = workbook.sheet_by_name('深空彼岸小说章节网站')
    print(Sheet_name.name, Sheet_name.ncols, Sheet_name.nrows)

    # get the first sheet
    Sheet1 = workbook.sheet_by_index(0)

    # get the values of the second column (the URLs)
    cols = Sheet1.col_values(1)
    # print(urls)
    urls = cols[1264:1269]  # only a small slice here; widen the range to download more chapters

    for url in urls:
        get_text(url)
        time.sleep(random.randint(1, 3))  # random delay to avoid hammering the server

if __name__ == '__main__':
    main()



Summary

The above covers this Python scraping example. This article only demonstrates a simple use case, scraping a web novel; I will keep posting more Python scraping applications in follow-up articles. Thanks for reading.
