赞
踩
https://download.csdn.net/download/weixin_66397563/87651644?spm=1001.2014.3001.5503
import os
import re

import pandas as pd
import requests
-
# Cookie sent with every request (presumably copied from a browser session --
# verify before relying on it).  Set the SHANGHAIRANKING_COOKIE environment
# variable to supply a fresh value without editing the source; the literal
# below is only the fallback.
cookie = os.environ.get(
    "SHANGHAIRANKING_COOKIE",
    '''Hm_lvt_af1fda4748dacbd3ee2e3a69c3496570=1656413001,1656428399,1656458742,1656466843; TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE2NTY0ODU5OTMsImp0aSI6IjI3MzA0NSIsImlzcyI6IjE1MCoqKio0NTcyIn0.S6wbuYOwHzcV-VTkCB3MNxy7L5t1mpWJVv-NCsXzJn0; Hm_lpvt_af1fda4748dacbd3ee2e3a69c3496570=1656470377''',
)

# Anti-anti-crawler measure: present the requests as coming from a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.7 Safari/537.36',
    'Cookie': cookie,
}
-
# Output directory layout: one root folder plus one sub-folder per ranking.
b1 = "data/软科排行/"
s1 = os.path.join(b1, "中国大学排名/")
s2 = os.path.join(b1, "中国最好学科排名/")
s3 = os.path.join(b1, "中国大学专业排名/")
s4 = os.path.join(b1, "世界大学学术排名/")
s5 = os.path.join(b1, "世界一流学科排名/")
s6 = os.path.join(b1, "中国两岸四地大学排名/")
s7 = os.path.join(b1, "全球体育类院系学术排名/")

# exist_ok=True replaces the exists()-then-makedirs() pattern, which can fail
# if the directory appears between the check and the creation.
for _ranking_dir in (b1, s1, s2, s3, s4, s5, s6, s7):
    os.makedirs(_ranking_dir, exist_ok=True)
-
-
def get_payload_js(url):
    """Download the ``payload.js`` script referenced by a ranking page.

    Each page references its own payload.js, so the page at *url* is fetched
    first and searched for a ``/_nuxt/static/<path>/payload.js"`` reference;
    the script is then requested from www.shanghairanking.cn.

    Args:
        url: Address of the ranking page.

    Returns:
        The text of payload.js, or None when the page answers 404 (the
        ranking does not exist) or contains no payload.js reference.
    """
    resp = requests.get(url, headers=headers)
    if resp.status_code == 404:
        return None

    match = re.search(r'/_nuxt/static/(.*?)/payload\.js"', resp.text, re.S)
    if match is None:
        # Page layout changed or an error page was served: nothing to fetch.
        return None

    js_url = "https://www.shanghairanking.cn/_nuxt/static/{}/payload.js".format(match.group(1))
    return requests.get(js_url, headers=headers).text
-
-
def get_num(s):
    """Return the integer that precedes the first "-" in *s*.

    A string without "-" is converted as a whole.
    """
    leading, _sep, _rest = s.partition("-")
    return int(leading)
-
-
# get_num splits the string on "-" and returns the leading part as an int.
-
-
def get_bcur_data(year):
    """Fetch the Chinese university ranking (bcur) for *year*.

    Args:
        year: Ranking year as an int.  Years 2015-2022 have a known
            indicator code; for any other year the indicator column is
            filled with empty strings.

    Returns:
        A dict mapping column title to a list of values (one entry per
        university), or None when the API answers 404.
    """
    url = f"https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year={year}"
    resp = requests.get(url, headers=headers)
    if resp.status_code == 404:
        return None
    data = resp.json()

    # Key inside each entry's "indData" that holds the exported indicator.
    indicator_codes = {
        2015: '1',
        2016: '10',
        2017: '19',
        2018: '28',
        2019: '38',
        2020: '59',
        2021: '159',
        2022: '271',
    }
    code = indicator_codes.get(year)
    k = "办学层次" if year in (2022, 2021, 2020) else "生源质量"

    items = {
        "排名": [],
        "学校名称": [],
        "省市": [],
        "类型": [],
        "总分": [],
        k: [],
    }
    for entry in data['data']['rankings']:
        items['排名'].append(entry['ranking'])
        items['学校名称'].append(entry['univNameCn'])
        items['省市'].append(entry['province'])
        items['类型'].append(entry['univCategory'])
        items['总分'].append(entry['score'])
        # Always append exactly one value so every column keeps the same
        # length, even for an unknown year or a missing indicator.
        items[k].append((entry.get('indData') or {}).get(code, ''))
    return items
-
-
def get_rtugc_data(year):
    """Scrape the Greater China university ranking (rtugc) for *year*.

    The data is extracted from the page's payload.js: every ``{ranking...},``
    object literal is one row, and each field is pulled out with a regular
    expression and resolved through the payload's parameter mapping.

    Returns:
        A dict mapping column title to a list of values, or None when the
        payload could not be retrieved.
    """
    payload_js = get_payload_js("https://www.shanghairanking.cn/rankings/rtugc/{}".format(year))
    if not payload_js:
        return None
    params = get_params(payload_js)

    # (column title, pattern locating the field inside one row literal)
    columns = (
        ("排名", r'ranking\:(.*?)\,'),
        ("学校名称", r'univNameCn\:(.*?)\,'),
        ("国家/地区", r'region\:(.*?)\,'),
        ("总分", r'score\:(.*?)\,'),
        ("研究生比例", r'data\:\{\"(.*?)\"\:(.*?)\,'),
    )
    items = {title: [] for title, _pattern in columns}

    for body in re.findall(r'\{ranking(.*?)\},', payload_js, re.S):
        # Restore the key that the capture group cut off.
        record = "ranking" + body
        for title, pattern in columns:
            items[title].append(get_data(record, pattern, params))
    return items
-
-
def get_grsssd_data(year):
    """Scrape the global sport science schools ranking (grsssd) for *year*.

    Rows are the ``{ranking...},`` object literals found in the page's
    payload.js; each field is extracted with a regular expression and
    resolved through the payload's parameter mapping.

    Returns:
        A dict mapping column title to a list of values, or None when the
        payload could not be retrieved.  A falsy score is stored as ''.
    """
    url = "https://www.shanghairanking.cn/rankings/grsssd/{}".format(year)
    payload_js = get_payload_js(url)
    if not payload_js:
        return None
    params = get_params(payload_js)

    items = {
        "排名": [],
        "学校名称": [],
        "国家/地区": [],
        "总分": [],
        "论文数": [],
    }
    # Raw strings: the patterns contain backslashes that are not valid
    # Python string escapes.
    for body in re.findall(r'\{ranking(.*?)\},', payload_js, re.S):
        record = "ranking" + body  # restore the key cut off by the capture
        rank = get_data(record, r'ranking\:(.*?)\,', params)
        univNameCn = get_data(record, r'univNameCn\:(.*?)\,', params)
        score = get_data(record, r'score\:(.*?)\,', params)
        region = get_data(record, r'region\:(.*?)\,', params)
        indData = get_data(record, r'indData\:\{\"(.*?)\"\:(.*?)\,', params)

        items['排名'].append(rank)
        items['学校名称'].append(univNameCn)
        items['国家/地区'].append(region)
        items['总分'].append(score or '')
        items['论文数'].append(indData)
    return items
-
-
def get_arwu_data(year):
    """Fetch the Academic Ranking of World Universities (arwu) for *year*.

    Args:
        year: Ranking year, as an int or a str.

    Returns:
        A dict mapping column title to a list of values.  The last column
        is "教师获奖" for 2003 and "校友获奖" for every other year; when the
        API lists no indicator with that name the column holds ''.
    """
    url = f"https://www.shanghairanking.cn/api/pub/v1/arwu/rank?year={year}"
    print(url)
    resp = requests.get(url, headers=headers)
    data = resp.json()

    # Compare for equality: the former ``year in '2003'`` was a substring
    # test (true for '20' or '3') and raised TypeError for an int year.
    k = "教师获奖" if str(year) == "2003" else "校友获奖"

    items = {
        "排名": [],
        "学校名称": [],
        "国家/地区": [],
        "排名(国家/地区)": [],
        "总分": [],
        k: [],
    }
    # Indicator display name -> indicator code used as key in "indData".
    inds = {ind['nameCn']: ind['code'] for ind in data['data']['inds']}
    has_indicator = k in inds

    for r in data['data']['rankings']:
        items['排名'].append(r['ranking'])
        items['学校名称'].append(r['univNameCn'])
        items['国家/地区'].append(r['region'])
        items['排名(国家/地区)'].append(r['regionRanking'])
        items['总分'].append(r['score'])
        if has_indicator:
            items[k].append(r['indData'].get(inds[k], "0"))
        else:
            items[k].append('')
    return items
-
-
def get_bcsr_name(year):
    """List the subject codes and names of the best Chinese subjects ranking.

    The list is parsed from the ranking's payload.js, e.g.
    https://www.shanghairanking.cn/_nuxt/static/1655530991/rankings/bcsr/2017/payload.js

    Args:
        year: Ranking year.

    Returns:
        A list of ``{"id": code, "name": name}`` dicts.  Entries whose code
        is not a string or has at most two characters are skipped.
    """
    # NOTE(review): the static build id 1655530991 is hard-coded in the URL;
    # confirm it is still served by the site.
    url = f"https://www.shanghairanking.cn/_nuxt/static/1655530991/rankings/bcsr/{year}/payload.js"
    resp = requests.get(url, headers=headers)
    params = get_params(resp.text)

    id_items = []
    # Raw strings: the patterns contain backslashes that are not valid
    # Python string escapes.
    for body in re.findall(r'\{code(.*?)\},', resp.text, re.S):
        record = "code" + body  # restore the key cut off by the capture
        _id = get_data(record, r'code\:(.*?)\,', params)
        # The code may resolve to None/False through params; len() would fail.
        if not isinstance(_id, str) or len(_id) <= 2:
            continue
        nameCn = get_data(record, r'nameCn\:(.*?)\,', params)
        id_items.append({"id": _id, "name": nameCn})
    return id_items
-
-
def get_bcsr_data(id_s, year):
    """Fetch the best Chinese subjects ranking table for one subject.

    Args:
        id_s: Subject code.
        year: Target year as an int; the previous year is requested as the
            comparison year.

    Returns:
        A dict mapping column title to a list of values.  The previous
        year's rank is '' when the API provides none.
    """
    prev_year = year - 1
    url = f"https://www.shanghairanking.cn/api/pub/v1/bcsr/rank?target_yr={year}&yr={prev_year}&subj_code={id_s}"
    payload = requests.get(url, headers=headers).json()

    current_col = f"{year}排名"
    previous_col = f"{prev_year}排名"
    items = {
        current_col: [],
        previous_col: [],
        "全部层次": [],
        "学校名称": [],
        "总分": [],
    }

    contrast_key = f"{prev_year}"
    for row in payload['data']['rankings']:
        items[current_col].append(row['ranking'])
        contrast = row['contrastRanking']
        previous_rank = contrast.get(contrast_key) if contrast else None
        items[previous_col].append(previous_rank if previous_rank else '')
        items['全部层次'].append(row['rankPctTop'])
        items['学校名称'].append(row['univNameCn'])
        items['总分'].append(row['score'])
    return items
-
-
def get_gras_name(year):
    """List the subject codes and names of the global subjects ranking (gras).

    The list is parsed from the ranking's payload.js, e.g.
    https://www.shanghairanking.cn/_nuxt/static/1655530313/rankings/gras/2021/payload.js

    Args:
        year: Ranking year as an int.

    Returns:
        A list of ``{"id": code, "name": name}`` dicts.  Entries whose code
        is not a string or has at most four characters are skipped.
    """
    # NOTE(review): the static build ids are hard-coded; 2019 and 2020 use a
    # different one from the other years.  Confirm they are still served.
    build_id = "1655530991" if year in (2019, 2020) else "1655530313"
    url = f"https://www.shanghairanking.cn/_nuxt/static/{build_id}/rankings/gras/{year}/payload.js"
    resp = requests.get(url, headers=headers)
    params = get_params(resp.text)

    id_items = []
    # Raw strings: the patterns contain backslashes that are not valid
    # Python string escapes.
    for body in re.findall(r'\{code(.*?)\},', resp.text, re.S):
        record = "code" + body  # restore the key cut off by the capture
        _id = get_data(record, r'code\:(.*?)\,', params)
        # The code may resolve to None/False through params; len() would fail.
        if not isinstance(_id, str) or len(_id) <= 4:
            continue
        nameCn = get_data(record, r'nameCn\:(.*?)\,', params)
        id_items.append({"id": _id, "name": nameCn})
    return id_items
-
-
def get_gras_data(id_s, year):
    """Fetch the global subjects ranking (gras) table for one subject.

    Args:
        id_s: Subject code.
        year: Ranking year as an int.

    Returns:
        A dict mapping column title to a list of values.  The last column
        is "重要期刊论文数" for 2020 and 2021 and "论文总数" otherwise; when
        the API lists no indicator with that name the column holds ''.
    """
    url = f"https://www.shanghairanking.cn/api/pub/v1/gras/rank?year={year}&subj_code={id_s}"
    resp = requests.get(url, headers=headers)
    data = resp.json()

    # The former ``year != 2020 | 2021`` evaluated the bitwise OR first
    # (2020 | 2021 == 2021), so 2020 was classified with the other years.
    k = "重要期刊论文数" if year in (2020, 2021) else "论文总数"

    rank_col = f"{year}排名"
    items = {
        rank_col: [],
        "学校名称": [],
        "国家/地区": [],
        "总分": [],
        k: [],
    }
    # Indicator display name -> indicator code used as key in "indData".
    inds = {ind['nameCn']: ind['code'] for ind in data['data']['inds']}
    has_indicator = k in inds

    for j in data['data']['rankings']:
        items[rank_col].append(j['ranking'])
        items['学校名称'].append(j['univNameCn'])
        items['国家/地区'].append(j['region'])
        items['总分'].append(j['score'])
        if has_indicator:
            items[k].append(j['indData'].get(inds[k], "0"))
        else:
            items[k].append('')
    return items
-
-
def get_bcmr_name():
    """List the major codes and names of the Chinese university majors ranking.

    The list is parsed from the payload.js of the 2021 ranking page.

    Returns:
        A list of ``{"id": code, "name": name}`` dicts (codes of at most
        four characters are skipped), or None when the payload could not
        be retrieved.
    """
    payload_js = get_payload_js("https://www.shanghairanking.cn/rankings/bcmr/2021")
    if not payload_js:
        return None
    params = get_params(payload_js)

    def resolve(token):
        # A token is either a parameter name (looked up) or a literal;
        # double quotes are stripped from the result either way.
        return params.get(token, token).replace('"', "")

    id_items = []
    for name_token, _between, code_token in re.findall('name=(.*?);(.*?)code=(.*?);', payload_js, re.S):
        name = resolve(name_token)
        _id = resolve(code_token)
        if len(_id) > 4:
            id_items.append({"id": _id, "name": name})
    return id_items
-
-
def parse(p):
    """Convert one JavaScript argument token to a Python value.

    Double quotes are removed first; the remaining text "null" becomes
    None, "false" becomes False, and anything else is returned as a string.
    """
    js_literals = {"null": None, "false": False}
    text = p.replace('"', "")
    return js_literals.get(text, text)
-
-
# parse converts the JavaScript literals null/false to Python's None/False (capitalized in Python) and strips double quotes.
-
-
def get_data(s, re_t, params):
    """Extract one field from a payload.js object literal.

    Args:
        s: Text of the object literal.
        re_t: Regular expression locating the field.  With several capture
            groups, the last group is taken as the value.
        params: Mapping from payload.js parameter names to their values.

    Returns:
        The parameter's value when the captured token is a parameter name,
        otherwise the token with double quotes removed.  None when *re_t*
        does not match (the field is absent from *s*).
    """
    found = re.findall(re_t, s, re.S)
    if not found:
        # Missing field: report "no value" instead of raising IndexError.
        return None

    token = found[0]
    if isinstance(token, tuple):
        token = token[-1]
    if token in params:
        return params[token]
    return token.replace('"', "")
-
-
def get_params(payload_js):
    """Map the parameter names of payload.js's wrapper function to its arguments.

    The names come from the first ``function(...)`` signature and the values
    from the first ``}(...)`` call found in *payload_js*; both lists are
    split on commas and paired by position.

    Args:
        payload_js: Text of a payload.js script.

    Returns:
        A dict from parameter name to value, each value converted by
        ``parse``.

    Raises:
        IndexError: If no signature or no call is found.
    """
    # Raw strings: the patterns contain backslashes that are not valid
    # Python string escapes.
    names_src = re.findall(r'function\((.*?)\)', payload_js, re.S)[0]
    values_src = re.findall(r"}\((.*?)\)", payload_js, re.S)[0]

    names = [x.strip() for x in names_src.split(",")]  # parameter list

    # The one known argument containing a comma is rewritten so the naive
    # comma split below does not break it in two.
    values_src = values_src.replace("\n", "").replace('"2021,2020"', '"2021|2020"')
    values = [parse(v) for v in values_src.split(",")]  # value list

    return dict(zip(names, values))
-
-
# get_params builds the mapping from payload.js parameter names to their values.
-
-
def get_bcmr_data(id_s):
    """Fetch the 2021 Chinese university majors ranking table for one major.

    Args:
        id_s: Major code.

    Returns:
        A dict mapping column title to a list of values.  The "省市" column
        holds the city, or the province when the city is falsy.
    """
    url = f"https://www.shanghairanking.cn/api/pub/v1/bcmr/rank?year=2021&majorCode={id_s}"
    rankings = requests.get(url, headers=headers).json()['data']['rankings']

    grades, ranks, names, places, scores = [], [], [], [], []
    for row in rankings:
        grades.append(row['grade'])
        ranks.append(row['ranking'])
        names.append(row['univNameCn'])
        places.append(row['city'] or row['province'])
        scores.append(row['score'])

    return {
        "评级": grades,
        "排名": ranks,
        "学校名称": names,
        "省市": places,
        "总分": scores,
    }
-
-
def mune():
    """Run the full crawl and write one CSV file per ranking table.

    Prints a start banner and the process id, writes the process id to
    ./data/RANKINGS_pid.txt, then walks the years 2003-2022 and downloads
    every ranking available for each year into a per-year sub-directory of
    the ranking's output folder.
    """
    print("RANKINGS爬虫启动")
    pid = os.getpid()
    print("pid:", pid)
    with open("./data/RANKINGS_pid.txt", "w") as f:
        f.write(str(pid))

    def year_dir(base, year):
        # Create (if needed) and return the per-year sub-directory of *base*.
        path = os.path.join(base, str(year))
        os.makedirs(path, exist_ok=True)
        return path

    def save(rows, folder, stem):
        # Write *rows* to <folder>/<stem>.csv; skipped when nothing was fetched.
        if rows:
            pd.DataFrame(rows).to_csv(os.path.join(folder, stem + ".csv"), index=False)

    for begin_year in range(2003, 2023):
        if begin_year >= 2015:
            folder = year_dir(s1, begin_year)
            save(get_bcur_data(begin_year), folder, "中国大学排名")

        if 2011 <= begin_year <= 2020:
            folder = year_dir(s6, begin_year)
            save(get_rtugc_data(begin_year), folder, "中国两岸四地大学排名")

        if begin_year in [2016, 2017, 2018, 2020, 2021]:
            folder = year_dir(s7, begin_year)
            save(get_grsssd_data(begin_year), folder, "全球体育类院系学术排名")

        if 2017 <= begin_year <= 2021:
            # Best Chinese subjects ranking: one file per subject.
            folder = year_dir(s2, begin_year)
            for rk in get_bcsr_name(begin_year) or []:
                save(get_bcsr_data(rk['id'], begin_year), folder, rk['name'])

            # Global subjects ranking: one file per subject.
            folder = year_dir(s5, begin_year)
            for gr in get_gras_name(begin_year) or []:
                save(get_gras_data(gr['id'], begin_year), folder, gr['name'])

        if begin_year in [2021, 2022]:
            print(begin_year)
            folder = year_dir(s3, begin_year)
            # get_bcmr_name() returns None when the page cannot be fetched;
            # "or []" turns that into "nothing to do" instead of a TypeError.
            # NOTE(review): get_bcmr_data takes no year, so both year folders
            # receive the same tables -- confirm this is intended.
            for zr in get_bcmr_name() or []:
                save(get_bcmr_data(zr['id']), folder, zr['name'])

        if 2003 <= begin_year <= 2021:
            folder = year_dir(s4, begin_year)
            save(get_arwu_data(str(begin_year)), folder, "世界大学学术排名")
-
-
if __name__ == '__main__':
    # Script entry point: walk the years 2003-2022 and write one CSV file per
    # ranking table into a per-year sub-directory of the ranking's folder.

    def _year_dir(base, year):
        # Create (if needed) and return the per-year sub-directory of *base*.
        path = os.path.join(base, str(year))
        os.makedirs(path, exist_ok=True)
        return path

    def _save_csv(rows, folder, stem):
        # Write *rows* to <folder>/<stem>.csv; skipped when nothing was fetched.
        if rows:
            pd.DataFrame(rows).to_csv(os.path.join(folder, stem + ".csv"), index=False)

    for begin_year in range(2003, 2023):
        if begin_year >= 2015:
            folder = _year_dir(s1, begin_year)
            _save_csv(get_bcur_data(begin_year), folder, "中国大学排名")

        if 2011 <= begin_year <= 2020:
            folder = _year_dir(s6, begin_year)
            _save_csv(get_rtugc_data(begin_year), folder, "中国两岸四地大学排名")

        if begin_year in [2016, 2017, 2018, 2020, 2021]:
            folder = _year_dir(s7, begin_year)
            _save_csv(get_grsssd_data(begin_year), folder, "全球体育类院系学术排名")

        if 2017 <= begin_year <= 2021:
            # Best Chinese subjects ranking: one file per subject.
            folder = _year_dir(s2, begin_year)
            for rk in get_bcsr_name(begin_year) or []:
                _save_csv(get_bcsr_data(rk['id'], begin_year), folder, rk['name'])

            # Global subjects ranking: one file per subject.
            folder = _year_dir(s5, begin_year)
            for gr in get_gras_name(begin_year) or []:
                _save_csv(get_gras_data(gr['id'], begin_year), folder, gr['name'])

        if begin_year in [2021, 2022]:
            print(begin_year)
            folder = _year_dir(s3, begin_year)
            # get_bcmr_name() returns None when the page cannot be fetched;
            # "or []" turns that into "nothing to do" instead of a TypeError.
            # NOTE(review): get_bcmr_data takes no year, so both year folders
            # receive the same tables -- confirm this is intended.
            for zr in get_bcmr_name() or []:
                _save_csv(get_bcmr_data(zr['id']), folder, zr['name'])

        if 2003 <= begin_year <= 2021:
            folder = _year_dir(s4, begin_year)
            _save_csv(get_arwu_data(str(begin_year)), folder, "世界大学学术排名")
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。