赞
踩
利用Python爬取天天基金的数据。
先打开天天基金网,然后选择感兴趣的主题基金,这里以煤炭为例。
进入页面之后,复制一个基金名称,在浏览器的控制台Network搜索这个名称。
点击我们搜索到的js,切换到Headers,这里有几个信息是我们需要的:General里的Request URL、Request Headers里的Cookie、Host、Referer、User-Agent。
# Crawl the fund list of one Tiantian Fund (fund.eastmoney.com) theme and
# export it to an Excel workbook.
import json
import math
import re
import time

import pandas as pd
import requests

# Request parameters copied from the browser's Network panel.
# 'callback' / 'themecallback' are the JSONP callback names the API echoes
# back around its JSON payload; 'cookie', 'host', 'referer' and 'useragent'
# are replayed as request headers.
# NOTE(review): the cookie is presumably session-specific and may need to be
# refreshed from the browser before running — confirm.
fund_obj = {
    'isbuy': 1,
    'sort': 'TRY',
    'sorttype': 'desc',
    'callback': 'jQuery18306680153539612363_1681216767557',
    'cookie': 'intellpositionL=1152px; em_hq_fls=js; intellpositionT=455px; qgqp_b_id=2c2759821488d533c53576ad44a41e5d; HAList=ty-1-000001-%u4E0A%u8BC1%u6307%u6570%2Cty-90-BK1036-%u534A%u5BFC%u4F53%2Ca-sh-603713-%u5BC6%u5C14%u514B%u536B%2Cf-0-000001-%u4E0A%u8BC1%u6307%u6570%2Ca-sh-600559-%u8001%u767D%u5E72%u9152%2Ca-sh-603195-%u516C%u725B%u96C6%u56E2%2Ca-sz-002236-%u5927%u534E%u80A1%u4EFD%2Ca-sh-600759-%u6D32%u9645%u6CB9%u6C14%2Cf-0-399439-%u56FD%u8BC1%u6CB9%u6C14%2Ca-sz-000049-%u5FB7%u8D5B%u7535%u6C60; EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND0=null; EMFUND8=04-11%2015%3A07%3A30@%23%24%u534E%u5546%u4FE1%u7528%u589E%u5F3A%u503A%u5238A@%23%24001751; AUTH_FUND.EASTMONEY.COM_GSJZ=AUTH*TTJJ*TOKEN; EMFUND9=04-11 15:32:10@#$%u62DB%u5546%u4E2D%u8BC1%u767D%u9152%u6307%u6570%28LOF%29A@%23%24161725; st_si=97748096283622; st_pvi=48514956678552; st_sp=2019-09-26%2018%3A30%3A34; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=1; st_psi=2023041120385018-112200312942-2229925924; st_asi=delete',
    'host': 'api.fund.eastmoney.com',
    'referer': 'http://fund.eastmoney.com/',
    'useragent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'themecallback': 'jQuery18306592616272463223_1681284141591',
}


class FundInfo:
    """Paged crawler for the funds belonging to one theme.

    Attributes:
        pageIndex: 1-based index of the next page to request.
        pageSize: number of funds requested per page.
        totalPage: number of pages, filled in by ``get_total_page``.
        totalCount: number of funds, filled in by ``get_total_page``.
        dataList: accumulated rows (lists of strings), one per fund.
        themeName: theme name, filled in by ``get_fund_theme``.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # A response key is exported when it contains any of these markers.
    # 'SOURCERATE' is already covered by 'RATE'; it is listed for clarity.
    _FIELD_MARKERS = (
        'SHORTNAME', 'FCODE', 'DWJZ', 'RZDF', 'SYRQ', 'FTYPE', 'RELATION',
        'SYL_Z', 'SYL_Y', 'SYL_3Y', 'SYL_6Y', 'SYL_JN', 'SYL_1N', 'SYL_2N',
        'SYL_3N', 'SYL_LN', 'RATE', 'SOURCERATE', 'MINSG', 'ISSALES', 'ISBUY',
    )

    # Excel header row; must have one label per exported field.
    _COLUMNS = ['基金代码', '', '日期', '原手续费', '净值', '日增长率', '手续费', 'ISBUY', '购买起点', '基金名称', '近1周','近1月','近3月','近6月','近1年','近2年','近3年','今年来','成立来', 'ISSALES','基金类型']

    def __init__(self, page_index=1, page_size=10, total_page=0, total_count=0):
        """Store the paging state and start with no collected rows."""
        self.pageIndex = page_index
        self.pageSize = page_size
        self.totalPage = total_page
        self.totalCount = total_count
        self.dataList = []
        self.themeName = ''

    @staticmethod
    def _parse_jsonp(callback, text):
        """Return the JSON payload wrapped as ``callback(<json>)`` in *text*.

        Raises:
            ValueError: if *text* does not contain a call to *callback*.
        """
        # The callback is escaped so it is always matched literally.
        match = re.search(rf'{re.escape(callback)}\((.*)\)', text, re.DOTALL)
        if match is None:
            raise ValueError(
                f'JSONP callback {callback!r} not found in response: {text[:200]!r}'
            )
        return json.loads(match.group(1))

    def get_response(self, parame_obj, parame_url):
        """GET *parame_url* with the browser headers taken from *parame_obj*.

        Args:
            parame_obj: mapping with 'cookie', 'host', 'referer', 'useragent'.
            parame_url: fully built request URL.

        Returns:
            The ``requests.Response`` for the call.
        """
        # Replay the browser's headers so the request looks like the page's own.
        headers = {
            'Cookie': parame_obj['cookie'],
            'Host': parame_obj['host'],
            'Referer': parame_obj['referer'],
            'User-Agent': parame_obj['useragent'],
        }
        # A timeout keeps the script from hanging forever on a stalled server.
        return requests.get(
            url=parame_url, headers=headers, timeout=self.REQUEST_TIMEOUT
        )

    def get_fund_theme(self, parame_obj):
        """Fetch the theme details and store its name in ``self.themeName``."""
        # NOTE(review): tp=801950 is presumably the id of the example theme
        # (coal) — confirm before reusing for another theme.
        url = f'http://api.fund.eastmoney.com/ztjj//GetBKDetailInfoNew?callback={parame_obj["themecallback"]}&tp=801950&_=1681284141906'
        data = self.get_response(parame_obj, url).text
        print('theme===', data)
        res = self._parse_jsonp(parame_obj['themecallback'], data)
        self.themeName = res['Data']['INDEXNAME']

    def get_fund_data(self, parame_obj):
        """Fetch page ``self.pageIndex`` of the fund list.

        Returns:
            The decoded JSON payload (a dict) of the JSONP response.
        """
        # callback, sort, sorttype, pageindex, pagesize and isbuy are the
        # variable parts of the Request URL seen in the browser.
        url = f'http://api.fund.eastmoney.com/ztjj/GetBKRelTopicFundNew?callback={parame_obj["callback"]}&sort={parame_obj["sort"]}&sorttype={parame_obj["sorttype"]}&pageindex={self.pageIndex}&pagesize={self.pageSize}&tp=801950&isbuy={parame_obj["isbuy"]}'
        data = self.get_response(parame_obj, url).text
        print(data)
        return self._parse_jsonp(parame_obj['callback'], data)

    def get_total_page(self, parame_res):
        """Derive ``totalCount`` and ``totalPage`` from a list response."""
        self.totalCount = int(parame_res['TotalCount'])
        self.totalPage = math.ceil(self.totalCount / self.pageSize)

    def get_fund_info(self, parame_res):
        """Append one row of stringified values per fund in *parame_res*.

        Values are taken in the order the keys appear in each record, keeping
        only keys that contain one of ``_FIELD_MARKERS``.
        """
        # NOTE(review): the column labels used on export assume the API returns
        # the keys in a fixed order with exactly 21 matches — verify against a
        # live response.
        for record in parame_res['Data'] or []:
            row = [
                str(value)
                for key, value in record.items()
                if any(marker in key for marker in self._FIELD_MARKERS)
            ]
            self.dataList.append(row)

    def change_page(self, parame_obj, parame_res):
        """Collect every remaining page and export all rows to Excel.

        Args:
            parame_obj: request parameters (see ``fund_obj``).
            parame_res: already-fetched response for page ``self.pageIndex``;
                it is consumed as-is instead of being requested again. Pass
                ``None`` to have that page fetched as well.
        """
        self.get_fund_theme(parame_obj)
        timestamp = int(time.time())

        page_res = parame_res
        while self.pageIndex <= self.totalPage:
            print(f'正在爬取第{self.pageIndex}页的数据内容')
            if page_res is None:
                # Pause between requests to avoid hammering the server.
                time.sleep(1)
                page_res = self.get_fund_data(parame_obj)
            # Store the page that was just obtained, not the first one again.
            self.get_fund_info(page_res)
            page_res = None
            self.pageIndex += 1

        if not self.dataList:
            # Nothing was collected, so there is nothing to export.
            return
        df = pd.DataFrame(self.dataList, columns=self._COLUMNS)
        df.to_excel(f'{self.themeName}基金{timestamp}.xlsx')


def main():
    """Fetch the first page, work out the page count, then crawl and export."""
    fund_info = FundInfo()
    res = fund_info.get_fund_data(fund_obj)
    fund_info.get_total_page(res)
    fund_info.change_page(fund_obj, res)


if __name__ == '__main__':
    main()
运行上面的代码,然后就可以得到一份数据。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。