In the era of big data, the internet holds a vast amount of information; for any individual, some of that data is valuable and some is not. So how do we find and quickly obtain the large amounts of data we actually need from this sea of information? A crawler is an excellent way to collect such data. In this article, I use scraping a web novel as an example to share some experience with Python crawlers and the problems I ran into along the way.
requests: a third-party Python library dedicated to sending HTTP requests.
Install the requests library: pip install requests
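Before diving in, here is a minimal sketch of how requests is used (the URL and header values below are placeholders for illustration only, not the ones used later in this article):

import requests

# placeholder example: fetch a page and inspect the response
headers = {'User-Agent': 'Mozilla/5.0'}  # a browser-like User-Agent helps avoid trivial blocking
response = requests.get('https://example.com', headers=headers, timeout=10)
print(response.status_code)   # 200 means the request succeeded
print(response.encoding)      # encoding guessed from the response headers
print(response.text[:200])    # first 200 characters of the returned HTML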
As an example, we will scrape the novel 《深空**》 from a certain site; the target URL is www.****.in/book/12793/.
Pick any chapter and inspect the page: you can find the links and titles of every chapter of the novel there. From this we get the xpath of the chapter links: //div[@class="panel panel-default"]/dl/dd/a/@href (a bit lost? No problem, we can simply copy a chapter's xpath directly in Google Chrome, as shown below).
(Screenshot: obtaining an element's xpath in Chrome)
e.g. the copied xpath of one chapter is /html/body/div[2]/div[2]/dl/dd[1]/a
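To see how this copied absolute xpath relates to the relative xpath given above, here is a minimal sketch run against a hand-written fragment that mimics the chapter-list markup (the fragment is an assumption based on the page source quoted in the full script further down, not copied verbatim from the site):

from lxml import etree

# hypothetical fragment imitating the chapter list structure
html = etree.HTML('''
<div class="panel panel-default">
  <dl>
    <dd class="col-md-3"><a href="38415404.html" title="第一章 旧土">第一章 旧土</a></dd>
    <dd class="col-md-3"><a href="38415405.html" title="第二章">第二章</a></dd>
  </dl>
</div>
''')
# the relative xpath picks up the href of every chapter link at once
print(html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href'))
# -> ['38415404.html', '38415405.html']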
The code for fetching the chapter links is as follows (example):
url = "https://www.xbxwx.in/book/12793/" # 网址
response = requests.get(url, headers=headers)
response.encoding = 'GB2312'
html = etree.HTML(response.text)
url_j_list = ['https://www.xbxwx.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
url_List.append(url_j_list)
In the same way, we can work out the xpath of the chapter text: //*[@id="htmlContent"]/text()
The code for fetching the chapter text is as follows (example):
# url_x, i, headers and the output paths are defined in the full script further down
rep = requests.get(url_x, headers=headers)
encoding = chardet.detect(rep.content)['encoding']  # detect the page encoding automatically
rep.encoding = encoding
dom = etree.HTML(rep.text)
# chapter title: //*[@id="content"]/div[1]/h1/text()
name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
# chapter body: one string per text node
text = dom.xpath('//*[@id="htmlContent"]/text()')
with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
    for con in text:
        f.write(con)
print(f'{name}_{i + 1} 下载完成')
The complete script for step 1 (collect every chapter URL and save the list to xiaoshuo.xls):

import requests
from lxml import etree
import xlwt

path = r'D:\Python_project\xiaoshuo\ '
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}


def get_urls():
    # The site paginates the chapter list, e.g.:
    '''
    Chapters 1-60:    https://www.***.in/book/12793/
    Chapters 60-120:  https://www.***.in/book/12793/index_2.html
    Page source:
    <select class="form-control" onchange="window.location=this.value;">
        <option value="/book/12793/">第1页</option>
        <option value="/book/12793/index_2.html">第2页</option>
        ......
        <option value="/book/12793/index_25.html" selected="">第25页(末页)</option>
    </select>
    /html/body/div[2]/div[2]/div[2]/select/option[1]
    '''
    url = "https://www.***.in/book/12793/"  # chapter index page
    response = requests.get(url, headers=headers)
    response.encoding = 'GB2312'
    html = etree.HTML(response.text)
    # absolute URL of every chapter-list page
    url_N_id = ['https://www.***.in/' + x for x in html.xpath('//select[@class="form-control"]/option/@value')]
    number = url_N_id.__len__()
    url_List = []
    for j in range(number):
        # list of chapter URLs on page N
        '''
        Getting a chapter URL: inspect the page, find the chapter link in the Elements panel and copy its xpath
        <dd class="col-md-3">
            <a href="38415404.html" title="第一章 旧土">第一章 旧土</a>
        </dd>
        Right click -> Copy XPath:
        xpath = /html/body/div[2]/div[2]/dl/dd[1]/a
        i.e. the xpath resolves to "38415404.html"
        '''
        if j != 0:
            url_j_id = url_N_id[j]
            response = requests.get(url_j_id, headers=headers)
            response.encoding = 'GB2312'
            html = etree.HTML(response.text)
        url_j_list = ['https://www.***.in/book/12793/' + x for x in html.xpath('//div[@class="panel panel-default"]/dl/dd/a/@href')]
        url_List.append(url_j_list)
    # flatten the per-page lists into one list of chapter URLs
    url_list = []
    for m in range(url_List.__len__()):
        for n in range(url_List[m].__len__()):
            url_list.append(url_List[m][n])
    return url_list


def main():
    urls = get_urls()
    # create the workbook object
    workbook = xlwt.Workbook(encoding='GB2312')
    Sheet_name = workbook.add_sheet('深空彼岸小说章节网站')
    Headers = ['序号', '网址']
    for index, Header in enumerate(Headers):
        Sheet_name.write(0, index, Header)
    for index, url in enumerate(urls):
        Sheet_name.write(index + 1, 0, index + 1)
        Sheet_name.write(index + 1, 1, url)
    workbook.save('xiaoshuo.xls')


if __name__ == '__main__':
    main()
The complete script for step 2 (read the chapter URLs back from xiaoshuo.xls and download each chapter as a txt file):

import requests
from lxml import etree
import xlrd
import time
import random
import chardet  # detects the text encoding automatically

path = r'D:\Python_project\xiaoshuo\ '
path1 = r'D:\Python_project\xiaoshuo\xs'
headers = {
    "Referer": "https://www.***.in/book/12793/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}


def get_text(url):
    rep = requests.get(url, headers=headers)
    '''
    rep.encoding = 'GB2312'
    '''
    encoding = chardet.detect(rep.content)['encoding']
    rep.encoding = encoding
    dom = etree.HTML(rep.text)
    # chapter title: //*[@id="content"]/div[1]/h1/text()
    # Some chapters are split across several pages, so the following pages must be downloaded as well.
    # Next-page URL pattern:
    '''
    e.g. page 1: https://www.***.in/book/12793/38415404.html
         page 2: https://www.***.in/book/12793/38415404_2.html
    page count:  //*[@id="content"]/div[1]/h1/small
    '''
    # number of pages in this chapter
    strnum = dom.xpath('//*[@id="content"]/div[1]/h1/small/text()')
    if len(strnum) == 0:
        # single-page chapter
        rep = requests.get(url, headers=headers)
        '''
        rep.encoding = 'GB2312'
        '''
        encoding = chardet.detect(rep.content)['encoding']
        rep.encoding = encoding
        dom = etree.HTML(rep.text)
        name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
        text = dom.xpath('//*[@id="htmlContent"]/text()')
        with open(path + f'{name}.txt', 'w', encoding='utf-8') as f:
            for con in text:
                f.write(con)
        print(f'{name} 下载完成')
    else:
        # multi-page chapter: the page count is taken from the <small> text next to the title
        str1 = strnum[0][3:4]
        num = int(str1)
        for i in range(num):
            if i == 0:
                url_x = url
            else:
                url_x = url[:-5] + '_' + str(i + 1) + '.html'
            rep = requests.get(url_x, headers=headers)
            '''
            rep.encoding = 'GB2312'
            '''
            encoding = chardet.detect(rep.content)['encoding']
            rep.encoding = encoding
            dom = etree.HTML(rep.text)
            name = dom.xpath('//*[@id="content"]/div[1]/h1/text()')[0]
            text = dom.xpath('//*[@id="htmlContent"]/text()')
            with open(path1 + '/' + f'{name}_{i + 1}.txt', 'w', encoding='utf-8') as f:
                for con in text:
                    f.write(con)
            print(f'{name}_{i + 1} 下载完成')


def main():
    # read the chapter URLs stored in the xls file
    workbook = xlrd.open_workbook('xiaoshuo.xls')
    Sheet_name = workbook.sheet_by_name('深空彼岸小说章节网站')
    print(Sheet_name.name, Sheet_name.ncols, Sheet_name.nrows)
    # get the first sheet
    Sheet1 = workbook.sheet_by_index(0)
    # read column 2 (the URLs)
    cols = Sheet1.col_values(1)
    # print(urls)
    urls = cols[1264:1269]  # only a small slice of chapters is downloaded here
    for url in urls:
        get_text(url)
        time.sleep(random.randint(1, 3))  # pause between requests to avoid hammering the site


if __name__ == '__main__':
    main()
That wraps up this introduction to Python crawlers. This article only covers a simple example of using a crawler to scrape a web novel; I will continue to post more Python crawler application examples later. Thanks for reading.