赞
踩
QS世界大学排名有两个官方的网站,一个是国际网站,另一个是国内网站,上面的数据应该都是一样的,只是一个是英文,一个是中文。
qsChina,也就是国内网站上的QS排名数据获取相对比较方便,因为它可以选择每页显示所有数据,而国际网站上的那个则每页最多100条数据。虽然有这样的问题,但我最后还是选择国际网站上的数据,因为有些大学的中文翻译很奇怪。
我主要采用的还是selenium,这个虽然很慢,但是比较稳定。
# encoding=utf-8
"""Scrape the QS World University Rankings 2022 overall table into an Excel file.

The script drives Microsoft Edge through Selenium, walks the paginated
ranking table on topuniversities.com and writes one spreadsheet row per
university (rank, name, location and the seven score columns).
"""
import time

import xlsxwriter
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By

RANKING_URL = 'https://www.topuniversities.com/university-rankings/world-university-rankings/2022'
OUTPUT_FILE = 'QSRank2022.xlsx'
PAGE_COUNT = 13
# The original script probed div[1]..div[102] on every page; presumably that is
# 100 ranking entries plus interleaved ad blocks -- TODO confirm against the site.
MAX_BLOCKS_PER_PAGE = 102
COLUMN_TITLES = (
    'Rank',
    'University',
    'Location',
    'Overall Score',
    'International Students Ratio',
    'International Faculty Ratio',
    'Faculty Student Ratio',
    'Citations per Faculty',
    'Academic Reputation',
    'Employer Reputation',
)
BLOCK_XPATH = '//*[@id="ranking-data-load_ind"]/div[{}]'
# The "next" link is the first pagination anchor (from the third <li> onwards)
# whose class attribute is exactly 'page-link next'.
NEXT_PAGE_XPATH = (
    '//*[@id="alt-style-pagination"]/li[position()>=3]'
    '/a[@class="page-link next"]'
)


def write_university_row(driver, sheet, row, block_index):
    """Copy one ranking entry from the page into spreadsheet row ``row``.

    Raises NoSuchElementException if the entry lacks an expected cell.
    """
    block = BLOCK_XPATH.format(block_index)
    identity = block + '/div/div/div/div[1]/div/div/div/div'
    scores = block + '/div/div/div/div[2]/div/div/div/div'

    cell_paths = [
        identity + '/div[1]',              # rank
        identity + '/div[2]/div/div[1]',   # university name
        identity + '/div[2]/div/div[2]',   # location
    ]
    # Spreadsheet columns 3..9 map onto score cells 1..7.
    cell_paths.extend(scores + '[' + str(n) + ']' for n in range(1, 8))

    for column, path in enumerate(cell_paths):
        sheet.write(row, column, driver.find_element(By.XPATH, path).text)


def scrape_page(driver, sheet, first_row):
    """Write every non-ad entry of the current page; return the next free row."""
    row = first_row
    for block_index in range(1, MAX_BLOCKS_PER_PAGE + 1):
        try:
            block = driver.find_element(By.XPATH, BLOCK_XPATH.format(block_index))
        except NoSuchElementException:
            # Fewer blocks than expected (e.g. a short last page): stop here
            # instead of aborting the whole run.
            break
        if block.get_attribute('customblock'):  # exclude ad
            continue
        write_university_row(driver, sheet, row, block_index)
        print(str(row) + ' finished!')
        row += 1
    return row


def click_next_page(driver):
    """Click the pagination "next" link; return False when there is none."""
    try:
        link = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
    except NoSuchElementException:
        return False
    # Click through JavaScript, as the original script did.
    driver.execute_script('arguments[0].click();', link)
    time.sleep(1)
    return True


def main():
    """Run the scrape and always save the workbook and shut the browser down."""
    workbook = xlsxwriter.Workbook(OUTPUT_FILE)
    driver = None
    try:
        sheet = workbook.add_worksheet()
        driver = Edge()
        for column, title in enumerate(COLUMN_TITLES):
            sheet.write(0, column, title)

        driver.get(RANKING_URL)
        time.sleep(1)

        next_row = 1
        for page in range(PAGE_COUNT):
            next_row = scrape_page(driver, sheet, next_row)
            if page == PAGE_COUNT - 1:
                break
            # change to next page
            if not click_next_page(driver):
                print('next page link not found after page ' + str(page + 1))
                break
            print('chaneg to page ' + str(page + 2))
    finally:
        # XlsxWriter only writes the file on close(), so close even after an
        # error to keep whatever was scraped so far.
        try:
            workbook.close()
        finally:
            if driver is not None:
                # quit() ends the whole session; close() only closes the window.
                driver.quit()


if __name__ == '__main__':
    main()
上面的代码用于获取综合排名。当时主要遇到了这样一些问题:
问题解决:
总共有56个学科排名,其中5个是"BROAD SUBJECT AREA",另外51个是"SPECIFIC SUBJECT",下面是代码的实现。注意:每次启动时,都需要在浏览器打开网页之后手动重新输入网页地址,否则会跳转到qsChina的网页。
打开网页后先要选Subject,也是利用selenium执行js脚本的方式模拟浏览器点击。先打开下拉框,再选择相应的Subject,然后根据选择的Subject创建Excel表格。接着在网页下方可以获取这个Subject的条目数量。然后切换到具体的指标页面,改变每页显示的条目数量(尽可能大,可以少翻页)。再之后就可以跟综合排名一样获取具体的数据了。每遍历一页之后要翻页。
# encoding=utf-8
"""Scrape every QS 2021 subject ranking into one Excel file per subject.

For each entry of the subject dropdown the script selects the subject, reads
the total number of ranked institutions, switches to the "Ranking Indicators"
tab, raises the page size to 100 and then walks all result pages, writing one
spreadsheet row per institution into ``<subject name>.xlsx``.
"""
import math
import time

import xlsxwriter
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By

START_URL = 'https://www.topuniversities.com/university-rankings/university-subject-rankings/2021/arts-humanities'
# Dropdown positions 1..58; positions 1 and 7 are the group headings
# "BROAD SUBJECT AREA" and "SPECIFIC SUBJECT", not selectable subjects.
SUBJECT_OPTIONS = range(1, 59)
HEADING_OPTIONS = (1, 7)
ITEMS_PER_PAGE = 100
COLUMN_TITLES = (
    'Rank',
    'University',
    'Location',
    'Overall Score',
    'H-index Citations',
    'Citations per Paper',
    'Academic Reputation',
    'Employer Reputation',
)

SUBJECT_DROPDOWN_XPATH = '//*[@id="ranking-fillters"]/div[7]/div/div'
SUBJECT_OPTION_XPATH = '//*[@id="ranking-fillters"]/div[7]/div/div/div[2]/div[{}]'
TOTAL_COUNT_XPATH = '//*[@id="_totalcountresults"]'
INDICATORS_TAB_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[1]/div/div/ul/li[2]/a'
PAGE_SIZE_DROPDOWN_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[3]/div[4]/div[1]/div[2]/i'
PAGE_SIZE_OPTION_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[3]/div[4]/div[1]/div[2]/div[2]/div[4]'
BLOCK_XPATH = '//*[@id="ranking-data-load_ind"]/div[{}]'
# The "next" link is the first pagination anchor (from the third <li> onwards)
# whose class attribute is exactly 'page-link next'.
NEXT_PAGE_XPATH = (
    '//*[@id="alt-style-pagination"]/li[position()>=3]'
    '/a[@class="page-link next"]'
)


def js_click(driver, xpath):
    """Locate ``xpath``, click it through JavaScript, wait 1 s, return the element text.

    The text is read before the click because the element may disappear
    once the click has been processed.
    """
    element = driver.find_element(By.XPATH, xpath)
    text = element.text
    driver.execute_script('arguments[0].click();', element)
    time.sleep(1)
    return text


def write_institution_row(driver, sheet, row, block_index):
    """Copy one ranking entry from the page into spreadsheet row ``row``."""
    block = BLOCK_XPATH.format(block_index)
    identity = block + '/div/div/div/div[1]/div/div/div/div'
    scores = block + '/div/div/div/div[2]/div/div/div/div'

    cell_paths = [
        identity + '/div[1]/div',              # rank
        identity + '/div[2]/div/div[1]/div',   # university name
        identity + '/div[2]/div/div[2]',       # location
    ]
    # Spreadsheet columns 3..7 map onto score cells 1..5.
    cell_paths.extend(scores + '[' + str(n) + ']' for n in range(1, 6))

    for column, path in enumerate(cell_paths):
        sheet.write(row, column, driver.find_element(By.XPATH, path).text)


def scrape_page(driver, sheet, first_row, item_total):
    """Write every non-ad entry of the current page; return the next free row."""
    row = first_row
    block_index = 1
    while True:
        try:
            block = driver.find_element(By.XPATH, BLOCK_XPATH.format(block_index))
        except NoSuchElementException:
            # Ran past the last block on this page.
            break
        is_ad = block.get_attribute('customblock')  # exclude ad
        time.sleep(0.5)
        if not is_ad:
            write_institution_row(driver, sheet, row, block_index)
            print(str(row) + '/' + str(item_total) + ' finished!')
            row += 1
        block_index += 1
    return row


def click_next_page(driver):
    """Click the pagination "next" link; return False when there is none."""
    try:
        link = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
    except NoSuchElementException:
        return False
    driver.execute_script('arguments[0].click();', link)
    time.sleep(1)
    return True


def scrape_subject(driver, option_index):
    """Select one subject from the dropdown and export its ranking to Excel."""
    # change subject: open the dropdown, then pick the option
    js_click(driver, SUBJECT_DROPDOWN_XPATH)
    subject_name = js_click(driver, SUBJECT_OPTION_XPATH.format(option_index))
    print('Select Subject: ' + subject_name)

    # create sheet
    workbook = xlsxwriter.Workbook(subject_name + '.xlsx')
    try:
        sheet = workbook.add_worksheet()

        # get item total number
        total_text = driver.find_element(By.XPATH, TOTAL_COUNT_XPATH).text
        item_total = int(total_text)
        print('Total Item count in ' + subject_name + ': ' + total_text)

        # change tab to rank index
        js_click(driver, INDICATORS_TAB_XPATH)
        print('Change tab to Ranking Indicators')

        # change items number in every page (fewer pages to walk)
        js_click(driver, PAGE_SIZE_DROPDOWN_XPATH)
        js_click(driver, PAGE_SIZE_OPTION_XPATH)
        print('Now there are 100 items in every page')

        # initial the table head
        for column, title in enumerate(COLUMN_TITLES):
            sheet.write(0, column, title)

        page_count = int(math.ceil(item_total / ITEMS_PER_PAGE))
        next_row = 1
        for page in range(page_count):
            next_row = scrape_page(driver, sheet, next_row, item_total)
            if page == page_count - 1:
                break
            # next page
            if not click_next_page(driver):
                print('next page link not found after page ' + str(page + 1))
                break
            print('chaneg to page ' + str(page + 2))
        print('finish ' + subject_name)
    finally:
        # XlsxWriter only writes the file on close(), so close even after an
        # error to keep whatever was scraped for this subject.
        workbook.close()


def main():
    """Export every subject ranking and always shut the browser down."""
    driver = Edge()
    try:
        driver.get(START_URL)
        time.sleep(1)
        for option_index in SUBJECT_OPTIONS:
            # skip the border. "1" for "BROAD SUBJECT AREA"; "7" for "SPECIFIC SUBJECT"
            if option_index in HEADING_OPTIONS:
                continue
            scrape_subject(driver, option_index)
    finally:
        # quit() ends the whole session; close() only closes the window.
        driver.quit()


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。