
Scraping Novels from 笔趣阁 (biquge) with Python

1. First, import the modules the scraper needs:

# Import the required modules
import requests
from bs4 import BeautifulSoup
import os
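(If either third-party module is missing from your environment, `pip install requests beautifulsoup4` installs both; os is part of the standard library.)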

2. Use the os library to create a folder to save the novel in:

# Create a directory for the novel
if not os.path.exists("GT病毒进化者"):
    os.mkdir("GT病毒进化者")
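An equivalent one-liner, if you prefer it, is os.makedirs with exist_ok=True, which creates the directory and silently skips the step when it already exists:

# Create the directory, ignoring the case where it already exists
os.makedirs("GT病毒进化者", exist_ok=True)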

3. Set the request headers (a browser User-Agent helps the requests pass for normal page visits):

# Set the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}

4. Request the novel's table-of-contents page by its URL:

# Request the novel's table-of-contents page
url = "https://www.xxbqg5200.com/shu/13515/"
res = requests.get(url, headers=headers)
res.encoding = "gbk"
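The encoding is forced to "gbk" because this site serves GBK-encoded pages; without it the Chinese text comes out garbled. If you adapt the script to another site and are unsure of its encoding, requests can guess it from the response body (an optional alternative, not what the script above relies on):

# Let requests detect the encoding from the response content
res.encoding = res.apparent_encoding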

5. Parse all the chapter links out of the page:

url2 = "https://www.xxbqg5200.com"
# Parse the table-of-contents page and collect all chapter links
soup = BeautifulSoup(res.text, "html.parser")
chapters = soup.find("div", id="list").find_all("a")
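Joining url2 and each href by plain concatenation works as long as the hrefs are site-absolute paths. A sturdier way to build the chapter URLs is urllib.parse.urljoin, which also copes with relative hrefs (a sketch; the chapter_links name is ours, not from the original post):

from urllib.parse import urljoin

# Build absolute chapter URLs that work for both absolute and relative hrefs
chapter_links = [urljoin(url, a["href"]) for a in chapters]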

6. Iterate over each chapter link, request the page, and save the content into the directory created above:

for chapter in chapters:
    chapter_url = url2 + chapter["href"]
    # print(chapter['href'])
    # print(chapter_url)
    chapter_res = requests.get(chapter_url, headers=headers)
    chapter_res.encoding = "gbk"
    chapter_soup = BeautifulSoup(chapter_res.text, "html.parser")
    # print(chapter_soup)
    chapter_title = chapter_soup.find("div", class_="bookname").h1.text
    chapter_content = chapter_soup.find("div", id="content").text.strip()
    with open(f"GT病毒进化者/{chapter_title}.txt", "w", encoding="utf-8") as f:
        f.write(chapter_content)
    print(f"{chapter_title} saved successfully!")
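Two practical caveats before running this loop on a long novel: chapter titles can contain characters that are illegal in filenames (such as ? or : on Windows), and firing requests back-to-back may get the scraper blocked. A hedged sketch of both fixes, using only the standard library (the sanitize_filename helper is our addition, not part of the original post):

import re
import time

def sanitize_filename(name):
    # Replace characters that Windows forbids in filenames with underscores
    return re.sub(r'[\\/:*?"<>|]', "_", name)

# Inside the loop: open(f"GT病毒进化者/{sanitize_filename(chapter_title)}.txt", ...)
# and after each chapter: time.sleep(0.5) to space out the requests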

The complete code:

# Import the required modules
import requests
from bs4 import BeautifulSoup
import os

# Create a directory for the novel
if not os.path.exists("GT病毒进化者"):
    os.mkdir("GT病毒进化者")

# Set the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
}

# Request the novel's table-of-contents page
url = "https://www.xxbqg5200.com/shu/13515/"
res = requests.get(url, headers=headers)
res.encoding = "gbk"

url2 = "https://www.xxbqg5200.com"
# Parse the table-of-contents page and collect all chapter links
soup = BeautifulSoup(res.text, "html.parser")
chapters = soup.find("div", id="list").find_all("a")

# Iterate over each chapter link, request the page, and save the chapter
for chapter in chapters:
    chapter_url = url2 + chapter["href"]
    # print(chapter['href'])
    # print(chapter_url)
    chapter_res = requests.get(chapter_url, headers=headers)
    chapter_res.encoding = "gbk"
    chapter_soup = BeautifulSoup(chapter_res.text, "html.parser")
    # print(chapter_soup)
    chapter_title = chapter_soup.find("div", class_="bookname").h1.text
    chapter_content = chapter_soup.find("div", id="content").text.strip()
    with open(f"GT病毒进化者/{chapter_title}.txt", "w", encoding="utf-8") as f:
        f.write(chapter_content)
    print(f"{chapter_title} saved successfully!")
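As written, a single failed request or a page missing the expected div will abort the whole run with an exception. A minimal hardening sketch, wrapping each chapter request in try/except and adding a timeout (our addition, not part of the original script):

for chapter in chapters:
    chapter_url = url2 + chapter["href"]
    try:
        chapter_res = requests.get(chapter_url, headers=headers, timeout=10)
        chapter_res.raise_for_status()  # raise on HTTP 4xx/5xx
    except requests.RequestException as e:
        print(f"{chapter_url} failed: {e}")
        continue
    # ... parse and save the chapter as above ...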
