赞
踩
1、第一段代码:用于获取目标对象(例如 GitHub 上的某个组织)所有开源项目的网址
# -*- coding: utf-8 -*-
"""
Created on Tue May 21 14:23:04 2019

@author: 慕蒿
@email: muhaocs@yeah.net
"""
import requests

# Listing to crawl.  Change the organisation URL and the page range to suit
# your own target; the original author used this script to collect the 1774
# open-source projects published by Google.
ORG_URL = 'https://github.com/google'
FIRST_PAGE = 41
LAST_PAGE = 50  # inclusive

# Tab-separated output file with five columns: project name, project URL,
# programming language, star count and the count shown on the "members" link.
OUTPUT_FILE = 'GitHub-google.txt'
HEADER = 'project_name\tproject_url\tproject_language\tproject_star\tproject_memb\n'

# Placeholder written when a field cannot be located in a project's HTML.
MISSING = 'None'

# Seconds to wait for GitHub before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def _language(fragment):
    """Return the text following the ``"programmingLanguage">`` marker."""
    try:
        return fragment.split('"programmingLanguage">')[1].split('<')[0].strip()
    except IndexError:
        # Marker absent from this fragment.
        return MISSING


def _link_count(fragment, marker):
    """Return the text between ``</span>`` and ``</a>`` following *marker*."""
    try:
        after_marker = fragment.split(marker)[1]
        return after_marker.split('</span>')[1].split('</a>')[0].strip()
    except IndexError:
        # Marker (or the expected tags after it) absent from this fragment.
        return MISSING


def getProject(url):
    """Fetch one listing page and extract a record for every project on it.

    NOTE(review): the HTML markers used below were written against GitHub's
    page layout in 2019 -- confirm they still match before relying on them.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with one entry per project.  Each entry is a list of five
        strings: name, URL, language, star count, "members" count.  The last
        three are the string ``'None'`` when they cannot be found.

    Raises:
        requests.RequestException: the page could not be downloaded, the
            request timed out, or the server answered with an error status.
        IndexError: the page does not contain the expected list markup.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error rather than an obscure IndexError while
    # parsing an error page.
    response.raise_for_status()
    content = response.text

    content_ul = content.split('<div class="org-repos repo-list">')[1]
    content_li = content_ul.split('<ul>')[1].split('</ul>')[0]
    # The first piece precedes the first heading and holds no project.
    projects = content_li.split('<h3 class="wb-break-all">')[1:]

    page_info = []
    for project in projects:
        project_name = project.split('>')[1].split('<')[0].strip()
        project_url = ('https://github.com'
                       + project.split('href="')[1].split('"')[0].strip())
        page_info.append([
            project_name,
            project_url,
            _language(project),
            _link_count(project, 'stargazers'),
            _link_count(project, 'members'),
        ])
    return page_info


def main():
    """Crawl the configured pages and write one tab-separated row per project."""
    # ``with`` guarantees the file is flushed and closed even if a page fails.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as fp:
        fp.write(HEADER)
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            url = ORG_URL + '?page=' + str(page)
            for row in getProject(url):
                fp.write('\t'.join(row) + '\n')
            print(page, 'Done!')


if __name__ == '__main__':
    main()
2、将上述代码生成的 txt 文件用 Excel 打开(我是按加星数降序排序后再下载的)。把其中的项目网址一列复制出来,另存为一个新的 txt 文件(下面的下载代码默认读取的文件名为 projects(google).txt)。该 txt 文件内仅包含项目网址,每行一个,如下所示:
https://github.com/google/physical-web
https://github.com/google/lovefield
https://github.com/google/blockly
https://github.com/google/go-cloud
https://github.com/google/android-classyshark
https://github.com/google/j2objc
https://github.com/google/flutter-desktop-embedding
https://github.com/google/closure-compiler
https://github.com/google/seesaw
https://github.com/google/seq2seq
https://github.com/google/error-prone
......
利用以下代码批量下载开源代码:
# -*- coding: utf-8 -*-
"""
Created on Tue May 21 14:23:04 2019

@author: 慕蒿
@email: muhaocs@yeah.net
"""
import os

import requests
import urllib3
from urllib import request

# Seconds to wait for a response before giving up instead of hanging forever.
REQUEST_TIMEOUT = 60


def _fetch_with_urllib(zip_url, target):
    """Download *zip_url* to *target* with ``urllib.request.urlretrieve``."""
    request.urlretrieve(zip_url, target)


def _fetch_with_requests(zip_url, target):
    """Download *zip_url* to *target* with ``requests``."""
    response = requests.get(zip_url, timeout=REQUEST_TIMEOUT)
    # Without this check an HTML error page would be saved as the archive.
    response.raise_for_status()
    with open(target, 'wb') as code:
        code.write(response.content)


def _fetch_with_urllib3(zip_url, target):
    """Download *zip_url* to *target* with ``urllib3``."""
    http = urllib3.PoolManager()
    r = http.request('GET', zip_url)
    if r.status >= 400:
        # urllib3 does not raise on error statuses by itself.
        raise RuntimeError('HTTP status %d for %s' % (r.status, zip_url))
    with open(target, 'wb') as code:
        code.write(r.data)


def download(url, save_path):
    """Download the zip archive linked from a project page.

    The project page is fetched and searched for the anonymous "download
    zip" button; the archive it links to is saved as
    ``save_path + <last URL segment> + '.zip'``.  Three download methods are
    tried in turn until one succeeds.

    NOTE(review): the button's CSS class string was written against GitHub's
    page layout in 2019 -- confirm it still matches.

    Args:
        url: Address of the project page.
        save_path: Prefix prepended to the archive file name; pass a
            directory path ending in ``/``.

    Returns:
        ``'<name> 下载完成!'`` on success, or ``'<name> 下载失败!!!!!!'`` when
        every download method failed.

    Raises:
        requests.RequestException: the project page could not be fetched.
        IndexError: the download button was not found on the page.
    """
    # Tolerate a trailing slash, which would otherwise give an empty name.
    file_name = url.rstrip('/').split('/')[-1]
    target = save_path + file_name + '.zip'

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    content = response.text
    btn = content.split('btn btn-outline get-repo-btn js-anon-download-zip-link ')[1]
    zip_url = 'https://github.com' + btn.split('href="')[1].split('"')[0].strip()

    # Every method fails if the destination directory does not exist yet.
    directory = os.path.dirname(target)
    if directory:
        os.makedirs(directory, exist_ok=True)

    for fetch in (_fetch_with_urllib, _fetch_with_requests, _fetch_with_urllib3):
        try:
            fetch(zip_url, target)
        except Exception:
            # Deliberately broad: any failure just means "try the next
            # method".  Unlike a bare ``except`` this still lets Ctrl-C
            # (KeyboardInterrupt) abort the whole batch.
            continue
        return file_name + ' 下载完成!'

    # Do not leave a truncated archive behind that looks like a real one.
    try:
        os.remove(target)
    except OSError:
        pass
    return file_name + ' 下载失败!!!!!!'


def main():
    """Download every project listed, one URL per line, in the list file."""
    save_path = "projects(google)/"
    # utf-8-sig also accepts plain ASCII and strips a BOM that some editors
    # add, which would otherwise corrupt the first URL.
    with open('projects(google).txt', 'r', encoding='utf-8-sig') as fp:
        for line in fp:
            url = line.strip()
            if not url:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            try:
                print(download(url, save_path))
            except Exception:
                # One broken project page must not stop the remaining ones.
                print(url, '下载失败!!!!!!')


if __name__ == '__main__':
    main()
3、下载成果如下所示:
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。