import requests
from lxml import etree
import time

'''
Approach:
1. Pick the novel to scrape and its entry url
2. Scrape the chapter links and build each chapter detail-page url by string concatenation
3. Scrape the book title
4. Scrape each chapter's title and the text of its body
5. Append the chapters one after another and save them as a single txt file
'''

# Request headers
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
url = 'http://www.biquge.info/84_84283/'


def get_html(url):
    # Fetch the page
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    # Parse the page and return the parsed document
    return etree.HTML(html.text)


# Get the links to every chapter from the table of contents
def get_list(url):
    soup = get_html(url)
    # Relative hrefs of all chapters
    list_box = soup.xpath('//*[@id="list"]/dl/dd/a/@href')
    # Join each relative href onto the entry url to get the detail-page urls
    return [url + i for i in list_box]


# Get the book title
def get_book_title(url):
    soup = get_html(url)
    book_title = soup.xpath('//*[@id="info"]/h1/text()')
    # xpath returns a list; take the first element instead of str()-ing the
    # whole list, which would leave brackets and quotes in the file name
    return book_title[0] if book_title else 'unknown'


# Get a chapter page's title
def get_title(url):
    soup = get_html(url)
    return soup.xpath('//*[@id="wrapper"]/div[4]/div/div[2]/h1/text()')


# Get a chapter page's body text
def get_novel_content(url):
    soup = get_html(url)
    return soup.xpath('//*[@id="content"]/text()')


# Save the whole book to a local txt file
def save_novel(url):
    book_lists = get_list(url)
    book_title = get_book_title(url)
    num = 1
    with open(book_title + '.txt', 'a', encoding='utf-8') as f:
        for list_url in book_lists:
            # Note: the chapter titles are not fully cleaned up here;
            # see the cleanup sketch after this listing
            for t in get_title(list_url):
                f.write(t + '\n')
            for c in get_novel_content(list_url):
                f.write(c + '\n')
            # time.sleep(2)  # optional delay to keep the request rate polite
            print('*** Chapter {} downloaded ***'.format(num))
            num += 1
    # the with-block closes the file; no explicit f.close() needed


if __name__ == '__main__':
    save_novel(url)
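On the cleanup issue flagged in save_novel: the text() nodes that xpath returns often carry stray leading/trailing whitespace and non-breaking spaces (\xa0), which is common on biquge-style pages. Below is a minimal sketch of one way to clean them; clean_text is a hypothetical helper, not part of the original script, and the assumption that plain strip() is enough should be verified against the actual pages.

# Minimal cleanup sketch (clean_text is a hypothetical helper, not in the
# original script). Assumes the only noise is surrounding whitespace,
# including non-breaking spaces, which str.strip() also removes.
def clean_text(nodes):
    # Strip each text() node and drop nodes that are empty after stripping
    return [n.strip() for n in nodes if n.strip()]

# Usage inside save_novel, replacing the two inner for-loops:
#     for t in clean_text(get_title(list_url)):
#         f.write(t + '\n')
#     for c in clean_text(get_novel_content(list_url)):
#         f.write(c + '\n')

Keeping the cleanup in one helper means both the title and the body go through the same normalization, so the saved txt file stays consistent chapter to chapter.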
Reference: a Bilibili video plus this WeChat article: https://mp.weixin.qq.com/s?__biz=MzIxOTcyMDM4OQ==&mid=2247483927&idx=1&sn=d4c9fcb6becc3e1d26a8d8385d8c2b99&chksm=97d7bdbda0a034ab3faf0f30ed50a1e35a0a9edcceb9b2ae9a0a6c7e4efd72a64cde07df439f&token=1524452913&lang=zh_CN#rd
The code there is fairly elegant and pleasant to read, with a clear line of thought.