# First import the libraries the crawler uses (requests and lxml) plus the os
# module for file operations
import requests
from lxml import etree
import os

# Site URLs, plus request headers that disguise the script as a normal browser
url = "https://www.1biqug.net"
base_url = "https://www.1biqug.net/29/29370/"
headers = {
    'Host': 'www.1biqug.net',
    'Referer': 'https://www.1biqug.net/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63'}

# Fetch the novel's table of contents and parse it with etree so the page can
# be queried with XPath
req = requests.get(url=base_url, headers=headers)
html = etree.HTML(req.text)
page_source = etree.tostring(html).decode("utf-8")  # serialized page source, kept only for inspection

novel_save_dir = os.path.join(os.getcwd(), "novel_cache")  # directory the chapters are saved to
os.makedirs(novel_save_dir, exist_ok=True)                 # create it before any file is written
hrefs = html.xpath("//dd/a/@href")[12:]  # chapter links; [12:] drops the entries that precede the ordered chapter list

# Visit each chapter link in turn
for href in hrefs:
    chapter_url = url + href
    resp = requests.get(chapter_url, headers=headers)
    resp.encoding = "utf-8"
    chapter_page = etree.HTML(resp.text)
    title_head = chapter_page.xpath("/html/body/div/div/div/div/h1/text()")[0]  # chapter title
    print(title_head)

    # All text nodes of the chapter body
    paragraphs = chapter_page.xpath("/html/body/div/div/div/div//text()")

    # Save the chapter: join the text nodes and append them to one .txt file per chapter
    chapter_text = ""
    for x in paragraphs:
        chapter_text = chapter_text + x + "\n"
    with open(os.path.join(novel_save_dir, title_head + ".txt"), "a", encoding="utf-8") as f:
        f.write(chapter_text)
    print("Download succeeded")