import requests
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        r = requests.get(url)       # fetch the HTML page content from the server
        r.raise_for_status()        # raise an exception if the fetch did not succeed
        r.encoding = r.apparent_encoding
        return r.text               # return the page content
    except:
        return ""

# This function saves all the chapter links into a list.
# For now only the links of the first 24 chapters are kept.
def fillWebList(html, wlist):
    i = 0
    soup = BeautifulSoup(html, "html.parser")
    for dd in soup.find('dl').children:
        i = i + 1
        if i >= 24:                 # skip the leading entries of the <dl>; the chapter list proper starts later
            if isinstance(dd, bs4.element.Tag):   # check whether the node is a Tag
                dds = dd.contents
                link = "http://www.tianxiabachang.cn" + dds[0].attrs['href']
                wlist.append(link)

def enterLink(ulist):               # follow each link and fetch the novel text
    i = 0
    for link in ulist:
        i = i + 1
        if i > 24:                  # stop after 24 chapters, before anything is written
            break
        html = getHTMLText(link)
        soup = BeautifulSoup(html, "html.parser")
        # This is a path on my machine; it differs per computer, so change it for yours
        f = open("C:/Users/联想/Desktop/exa.txt", "a+", encoding='utf-8-sig')
        f.write("第%d章" % i)       # chapter header, "Chapter N"
        f.write('\n')
        for br in soup.find(id='content').children:   # find the element whose id is "content" and iterate over all its children
            if br.string is not None:                 # only write nodes that carry a string
                f.write(br.string)
                f.write('\n')
        f.close()                   # close the file so each chapter is flushed to disk
        print("打印成功")           # "written successfully"

def main():
    wlist = []
    url = "http://www.tianxiabachang.cn/5_5731/"
    html = getHTMLText(url)
    fillWebList(html, wlist)
    enterLink(wlist)

main()
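To make the isinstance check in fillWebList concrete: BeautifulSoup's .children also yields the whitespace text nodes sitting between tags, so the loop has to filter those out before indexing .contents. Below is a minimal, self-contained sketch of that traversal on a made-up HTML snippet (the snippet itself is an assumption for illustration; only the <dl>/<dd>/<a> layout mirrors the real page):

import bs4
from bs4 import BeautifulSoup

# Hypothetical snippet shaped like the chapter list on the real site
sample = """
<dl>
  <dd><a href="/5_5731/001.html">Chapter 1</a></dd>
  <dd><a href="/5_5731/002.html">Chapter 2</a></dd>
</dl>
"""

soup = BeautifulSoup(sample, "html.parser")
for dd in soup.find('dl').children:
    # .children yields NavigableString whitespace between the <dd> tags,
    # so the Tag check filters those out before touching .contents
    if isinstance(dd, bs4.element.Tag):
        print("http://www.tianxiabachang.cn" + dd.contents[0].attrs['href'])

Running this prints the two absolute chapter URLs, which is exactly what fillWebList appends to wlist.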
Experts, please go easy on me...