赞
踩
用30行代码爬取某小说网站上的一篇小说
一、导入模块
import requests
from lxml import etree
import time
二、获取网站的响应信息,并以text打印
# Fetch the novel's chapter-list page.
url = 'https://www.biquge365.net/newbook/33411/'
head = {
    'Referer': 'https://www.biquge365.net/book/33411/',
    # BUG FIX: the key was misspelled 'users-agent', so the User-Agent
    # header was never actually sent; the correct header name is 'User-Agent'.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39',
}
# NOTE(review): verify=False disables TLS certificate verification and will
# emit an InsecureRequestWarning — acceptable only for this scraping target.
response = requests.get(url, headers=head, verify=False)
# print(response.text)
# Parse the HTML so we can run XPath queries against it.
html = etree.HTML(response.text)
三、获取小说的标题及小说目录的href
# --- Title and table of contents ---
TITLE_XPATH = '/html/body/div[1]/div[3]/div[1]/h1/text()'
TOC_XPATH = '/html/body/div[1]/div[4]/ul/li[*]/a/@href'

# xpath() always returns a list; the first element is the title text.
novel_name = html.xpath(TITLE_XPATH)[0]
# Relative hrefs of every chapter link in the directory list.
novel_directory = html.xpath(TOC_XPATH)
# The site errors out when hit too quickly, so pause before fetching chapters.
time.sleep(5)
四、用for循环遍历小说目录,并获取小说正文
# Walk the table of contents, fetch each chapter, and save it to disk.
# 'w' mode truncates the file on each open; 'a' would append instead.
import os

save_dir = 'D:\\小说\\'
# Ensure the target directory exists; open() would otherwise raise
# FileNotFoundError on the very first chapter.
os.makedirs(save_dir, exist_ok=True)

for href in novel_directory:
    chapter_url = 'https://www.biquge365.net' + href
    response2 = requests.get(chapter_url, headers=head)
    html2 = etree.HTML(response2.text)
    # Chapter title ([0]: xpath returns a list).
    novel_chapter = html2.xpath('//*[@id="neirong"]/h1/text()')[0]
    # Join the paragraph text nodes of the chapter body.
    novel_content = '\n'.join(html2.xpath('//*[@id="txt"]/text()'))
    # The with-statement closes the file automatically — no explicit
    # file.close() needed (the original called it redundantly).
    with open(save_dir + novel_chapter + '.txt', 'w', encoding='utf-8') as file:
        file.write(novel_chapter + '\n' + novel_content + '\n')
    print("下载成功" + novel_chapter)
    # Throttle between chapter requests — hitting the site too fast
    # causes errors (the original only slept once, before the loop).
    time.sleep(1)
六、下载成功展示
七、保存在电脑上的形式
八、效果展示
文章对你有帮助的话,麻烦点个赞吧!
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。