赞
踩
threading: Python 标准库自带的多线程模块(Python 3 中用它取代旧的 thread 模块),用于并发执行任务
requests: 第三方 HTTP 库(并非 Python 自带,需要手动安装:pip install requests),用于请求网络地址
os: Python 自带的标准库模块,用于文件、目录等相关操作
openpyxl: 开源的第三方 Excel 读写库,需要手动安装:pip install openpyxl
BeautifulSoup: 第三方 HTML/XML 解析库(包名 beautifulsoup4,通过 from bs4 import BeautifulSoup 导入),用于解析网页并提取其中的元素
Element
部分
# -*- coding:UTF-8 -*-
"""Collect article titles and links from a blog index page into a text file."""
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class downloader(object):
    """Scrape article titles and URLs from the blog index page.

    Attributes:
        server: Base URL that article hrefs are resolved against.
        target: URL of the index page that lists the articles.
        names: Article titles collected by ``getUrls``.
        urls: Absolute article URLs, parallel to ``names``.
        nums: Number of entries currently held in ``names`` / ``urls``.
    """

    def __init__(self):
        self.server = 'http://bodboy.gitee.io/'
        self.target = 'http://bodboy.gitee.io/blog/'
        self.names = []
        self.urls = []
        self.nums = 0

    def getUrls(self):
        """Fetch the index page and record every article title and URL.

        Each ``<header class="article-header">`` element is searched for
        ``<a class="article-title">`` links; the first link found supplies
        the title and href. Headers without a usable link are skipped.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        # A timeout keeps the script from hanging forever on a dead server.
        req = requests.get(url=self.target, timeout=10)
        req.raise_for_status()
        req.encoding = 'utf-8'
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed on the machine.
        soup = BeautifulSoup(req.text, 'html.parser')
        for header in soup.find_all('header', class_='article-header'):
            anchors = header.find_all('a', class_='article-title')
            print(anchors)
            if not anchors:
                continue
            anchor = anchors[0]
            href = anchor.get('href')
            if not href:
                continue
            # ``.string`` is None when the link wraps nested tags; fall back
            # to the concatenated text so the title is always a string.
            title = anchor.string
            if title is None:
                title = anchor.get_text()
            # urljoin handles relative, root-relative and absolute hrefs
            # without producing a doubled slash.
            self.urls.append(urljoin(self.server, href))
            self.names.append(str(title))
        # Count what was actually collected so names/urls/nums stay in sync.
        self.nums = len(self.urls)

    def write(self, name, path, text):
        """Append ``text`` followed by a blank line to the file at ``path``.

        Args:
            name: Title of the entry; accepted for interface compatibility
                and not used when writing.
            path: File to append to (created if missing), written as UTF-8.
            text: Content to append.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)
            f.write('\n\n')


if __name__ == "__main__":
    dl = downloader()
    dl.getUrls()
    print("开始下载文件....")
    for done, (title, link) in enumerate(zip(dl.names, dl.urls), start=1):
        # NOTE(review): the '\r' separator is kept from the original; it
        # looks like it may have been meant to be '\n' - confirm.
        dl.write(title, '博客文件.txt', title + '\r' + link)
        # Scale to 0-100 so the value matches the '%' sign in the message.
        sys.stdout.write('已下载:%.3f%%' % (done / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('文件下载完成')
"""Fetch a public essay list from a web API, export it to Excel, and download its images."""
import os
import threading

import requests
from openpyxl import Workbook


class capturePc():
    """Client for the public essay list endpoint.

    Provides helpers to fetch the list, collect image URLs from it,
    download those images, and export selected fields to an .xlsx file.
    """

    def __init__(self):
        # API endpoint
        self.base_url = 'http://api.newibao.com/web/essay/publicEssayList'
        # Query parameters
        self.params = {
            'page': 1,
            'size': 64
        }
        # Browser-like request headers. Original author's note: if login is
        # required, keep the session and add a token here.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        }

    def get_list(self):
        """Request the endpoint and return the ``data.list`` payload.

        Returns:
            The non-empty value found at ``data['list']`` in the JSON
            response, or None when it is empty, missing, or the body is
            not valid JSON.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        req = requests.get(
            url=self.base_url,
            headers=self.headers,
            params=self.params,
            timeout=10,
        )
        try:
            items = req.json()['data']['list']
        except (ValueError, KeyError, TypeError):
            # ValueError: body is not JSON; KeyError/TypeError: the JSON
            # does not have the expected data -> list nesting.
            print("no data find")
            return None
        return items or None

    def get_img_list(self):
        """Return a flat list of the image URLs found in the essay list.

        Reads each entry's ``picUrl`` value. A string is treated as a single
        URL; any other non-empty value is treated as an iterable of URLs.
        """
        img_urls = []
        for entry in self.get_list() or []:
            pics = entry.get('picUrl')
            if not pics:
                continue
            if isinstance(pics, str):
                # Extending a list with a str would add one character at a
                # time, so a lone URL has to be appended as a whole.
                img_urls.append(pics)
            else:
                img_urls.extend(pics)
        print("图片列表为:", img_urls)
        return img_urls

    def get_excel_data(self):
        """Return one row per essay with the fields exported to Excel.

        Each row holds addTime, brief, columnName, details, name and
        updateTime, in that order; a field missing from an entry is None.
        """
        fields = ('addTime', 'brief', 'columnName', 'details', 'name', 'updateTime')
        rows = [
            [entry.get(field) for field in fields]
            for entry in self.get_list() or []
        ]
        print(rows)
        return rows

    def download_img(self, name):
        """Download every listed image into the folder ``name``.

        The folder is created when it does not exist. One thread is started
        per image; files are numbered from 1 in list order. The method
        returns without waiting for the threads to finish.

        Args:
            name: Target folder path.
        """
        if not os.path.isdir(name):
            # makedirs also creates missing parent folders; exist_ok covers
            # the folder appearing between the check and the call.
            os.makedirs(name, exist_ok=True)
            print("文件夹{}创建成功".format(name))
        for index, image_url in enumerate(self.get_img_list() or [], start=1):
            threading.Thread(
                target=self.download, args=(index, image_url, name)
            ).start()

    def download(self, name, image_url, path):
        """Download one image and save it as ``<path>/<name>.jpg``.

        A failed request is reported on stdout and no file is written.

        Args:
            name: File name without extension.
            image_url: URL of the image to fetch.
            path: Existing folder to save the file into.
        """
        print('开始下载:', name)
        try:
            resp = requests.get(image_url, timeout=30)
            # Reject HTTP error responses so an error page is never saved
            # to disk under a .jpg name.
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('下载失败', name, exc)
            return
        file_path = '%s/%s.jpg' % (path, name)
        with open(file_path, 'wb') as f:
            f.write(resp.content)
        print('下载完成', name)

    def export_excel(self):
        """Write the essay list to '蓝海图文数据.xlsx' in the working directory.

        Row 1 holds the column headers and each following row holds one
        essay. The workbook is saved even when no data rows were fetched.
        """
        # Create the workbook object
        wb = Workbook()
        # Output file name
        data_fileName = '蓝海图文数据.xlsx'
        # Use the sheet that a new workbook starts with
        ws = wb.active
        # Header row
        header = ['创建时间', '标题', '分类', '详情', '名称', '更新时间']
        for column, title in enumerate(header, start=1):
            ws.cell(row=1, column=column, value=title)
        # Data rows
        for record in self.get_excel_data() or []:
            ws.append(record)
        wb.save(filename=data_fileName)
        print("写入成功")


if __name__ == "__main__":
    a = capturePc()
    # The original script had a disabled prompt here asking the user for an
    # English file name; only the Excel export is run.
    a.export_excel()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。