
Python: log in to Sina Weibo and scrape fan information

I recently had a small requirement: scrape the fan (follower) information of a Sina Weibo account. It took several days of tinkering, but it finally works. Here is the code:

Environment:

OS: Windows 7

Python: 3.3

IDE: PyCharm 4.0.4

Reference: http://blog.csdn.net/crystal_zero/article/details/51154632
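Besides the standard library, the script uses the third-party packages requests, rsa, Pillow (imported as PIL) and selenium, all installable with pip, plus a standalone PhantomJS binary for the headless-browser part.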

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import base64
import rsa
import binascii
import requests
import re
import random
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib
try:
    from PIL import Image
except ImportError:
    pass
try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

'''
If login protection is not enabled, you can log in without a captcha.
If login protection is enabled, a captcha is required.
'''
# Build the request headers
agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0'
headers = {
    # "Host": "www.weibo.com",
    'User-Agent': agent
}

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except:
    print("Cookies could not be loaded")

# Visit the landing page first so the session carries the baseline cookies
index_url = "http://weibo.com/login.php"
try:
    session.get(index_url, headers=headers, timeout=2)
except:
    session.get(index_url, headers=headers)

try:
    input = raw_input  # Python 2 compatibility
except NameError:
    pass
def get_su(username):
    """
    The email address / phone number is first run through JavaScript's
    encodeURIComponent (urllib.parse.quote_plus in Python 3),
    then base64-encoded and decoded back to str.
    """
    username_quote = quote_plus(username)
    username_base64 = base64.b64encode(username_quote.encode("utf-8"))
    return username_base64.decode("utf-8")


# Pre-login request that returns servertime, nonce, pubkey and rsakv
def get_server_data(su):
    pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su="
    pre_url = pre_url + su + "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_="
    pre_url = pre_url + str(int(time.time() * 1000))
    pre_data_res = session.get(pre_url, headers=headers)
    sever_data = eval(pre_data_res.content.decode("utf-8").replace("sinaSSOController.preloginCallBack", ''))
    return sever_data
    # print(sever_data)


def get_password(password, servertime, nonce, pubkey):
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # build the RSA public key
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)  # plaintext layout taken from the JS encryption code
    message = message.encode("utf-8")
    passwd = rsa.encrypt(message, key)  # encrypt
    passwd = binascii.b2a_hex(passwd)  # hex-encode the ciphertext
    return passwd


def get_cha(pcid):
    cha_url = "http://login.sina.com.cn/cgi/pin.php?r="
    cha_url = cha_url + str(int(random.random() * 100000000)) + "&s=0&p="
    cha_url = cha_url + pcid
    cha_page = session.get(cha_url, headers=headers)
    with open("cha.jpg", 'wb') as f:
        f.write(cha_page.content)
    try:
        im = Image.open("cha.jpg")
        im.show()
        im.close()
    except:
        print(u"Open cha.jpg in the current directory and type in the captcha")
def login(username, password):
    # su is the encoded username
    su = get_su(username)
    sever_data = get_server_data(su)
    servertime = sever_data["servertime"]
    nonce = sever_data['nonce']
    rsakv = sever_data["rsakv"]
    pubkey = sever_data["pubkey"]
    showpin = sever_data["showpin"]
    password_secret = get_password(password, servertime, nonce, pubkey)
    postdata = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'useticket': '1',
        'pagerefer': "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl",
        'vsnf': '1',
        'su': su,
        'service': 'miniblog',
        'servertime': servertime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'rsakv': rsakv,
        'sp': password_secret,
        'sr': '1366*768',
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    if showpin == 0:
        login_page = session.post(login_url, data=postdata, headers=headers)
    else:
        pcid = sever_data["pcid"]
        get_cha(pcid)
        postdata['door'] = input(u"Enter the captcha: ")
        login_page = session.post(login_url, data=postdata, headers=headers)
    login_loop = (login_page.content.decode("GBK"))
    # print(login_loop)
    pa = r'location\.replace\([\'"](.*?)[\'"]\)'
    loop_url = re.findall(pa, login_loop)[0]
    # print(loop_url)
    # A check that the login actually succeeded could be added here; left for a future improvement
    login_index = session.get(loop_url, headers=headers)
    uuid = login_index.text
    uuid_pa = r'"uniqueid":"(.*?)"'
    uuid_res = re.findall(uuid_pa, uuid, re.S)[0]
    web_weibo_url = "http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1" % uuid_res
    weibo_page = session.get(web_weibo_url, headers=headers)
    weibo_pa = r'<title>(.*?)</title>'
    # print(weibo_page.content.decode("utf-8"))
    contents = re.findall(weibo_pa, weibo_page.content.decode("utf-8", 'ignore'), re.S)
    if len(contents):
        userID = contents[0]
        print(u"Welcome %s, you are using xchaoinfo's simulated Weibo login" % userID)
        session.cookies.save()
        return True
    else:
        return False
def getWeiboPageContent(url):
    from selenium import webdriver
    # Headers, proxies and other PhantomJS capabilities can be configured here
    cap = webdriver.DesiredCapabilities.PHANTOMJS
    cap["phantomjs.page.settings.resourceTimeout"] = 1000
    cap["phantomjs.page.settings.loadImages"] = False
    cap["phantomjs.page.settings.disk-cache"] = True
    cap["phantomjs.page.customHeaders.Cookie"] = 'SINAGLOBAL=1090764576516.7528.1472895523482;' \
        'ULV=1473054803412:3:3:2:8569250075753.363.1473054803408:1472968709618;' \
        'SCF=AsEbWSFBj4yI1a200LCVvAmkggqyRCHJ705_J2slnQgOZiIr6H6PjN2HTWiP8y_wK4rtnl9XPpwDIkzi3Do0iHg.;' \
        'SUHB=0wn1u-A905WJz_;' \
        'un=18704027874;' \
        'UOR=,,www.baidu.com;' \
        'wvr=6;' \
        'SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFwo9sl7cB8PE.FyBvPnqYc5JpX5K2hUgL.Foq7eKe0S0.7SKz2dJLoIEBLxK-LBo2LBK5LxK.L1KzL1-qLxK-LB.eL1heLxK-LBo2LBK5t;' \
        'YF-Ugrow-G0=1eba44dbebf62c27ae66e16d40e02964;' \
        'YF-V5-G0=5f9bd778c31f9e6f413e97a1d464047a;' \
        'YF-Page-G0=8ec35b246bb5b68c13549804abd380dc;' \
        'WBtopGlobal_register_version=370d73e0c771001f;' \
        'login_sid_t=1a1f99aaff13878558a61d66c6b9c0a4;' \
        '_s_tentry=www.baidu.com;' \
        'Apache=8569250075753.363.1473054803408;' \
        'SSOLoginState=1473054827;' \
        'SUB=_2A256yvvYDeTxGeBO6lES9yfMzj6IHXVZvmoQrDV8PUJbmtBeLWyjkW8OibLNYiqudnUDwBp6GgtGcUnFuQ..'  # a large part of the cookie value has been removed
    driver = webdriver.PhantomJS(executable_path="D:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe",
                                 desired_capabilities=cap)
    driver.set_window_size(1120, 1100)
    # print(url)
    driver.get(url)
    page_source = driver.page_source
    # print(page_source)
    # contents = driver.find_elements_by_tag_name("a")
    # for content in contents:
    #     print(content.text)
    return page_source
from lib3 import spider
from lib3 import fileWriter
import os
import sys


class weiboSpider:
    def __init__(self, userAgent, userName, password):
        self.sp = spider.Spider(userAgent)
        self.loginStatus = login(userName, password)

    def go(self, url):
        status = True
        pageIndex = 1
        while status:
            fix = str(pageIndex) + "#Pl_Official_HisRelation__63"
            curUrl = url + fix
            print(curUrl)
            status = self.parse(curUrl)
            pageIndex += 1

    def cur_file_dir(self):
        # Get the directory this script runs from.  sys.path[0] is the script's
        # directory for a plain .py file, but the file path itself for a
        # py2exe-compiled executable, so take dirname in that case.
        path = sys.path[0]
        if os.path.isdir(path):
            return path
        elif os.path.isfile(path):
            return os.path.dirname(path)

    def saveData(self, fansUrl, fansId):
        fHandle = fileWriter.FileWriter(self.cur_file_dir(), 'zhiHuData.txt', 'a')
        fHandle.write(fansUrl + '\t' + fansId + '\r\n')
        fHandle.close()

    def parse(self, url):
        if self.loginStatus:
            content = getWeiboPageContent(url)
            strPattern = '''<a class="S_txt1" target="_blank" usercard=".*?" href="(.*?)">(.*?)</a>'''
            resultsList = self.sp.parseReg(content, strPattern, 2)
            if len(resultsList):
                for i in range(0, len(resultsList)):
                    fansUrl = "http://weibo.com" + resultsList[i][0]
                    fansId = resultsList[i][1]
                    self.saveData(fansUrl, fansId)
                    print(fansUrl, fansId)
                return True
            else:
                print("no data")
                return False
        else:
            print("sorry, cannot log in to Weibo")
            return False


if __name__ == "__main__":
    demo = weiboSpider('Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                       'your username here', 'your password here')
    demo.go('http://weibo.com/p/1005053810677006/follow?relate=fans&page=')
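For clarity, here is what the su encoding in get_su produces for a made-up address: the username is URL-quoted first, then base64-encoded.

    >>> quote_plus("example@test.com")
    'example%40test.com'
    >>> get_su("example@test.com")
    'ZXhhbXBsZSU0MHRlc3QuY29t'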


The lib3 package referenced above is a small HTTP helper library of my own; its two modules follow.


fileWriter.py

# coding=utf-8
__author__ = 'zhengjinwei'
import os


class FileWriter:
    def __init__(self, fileDir, fileName, mode):
        self.mkDir(fileDir)
        self.f = open(fileDir + u"/" + fileName, mode)

    def mkDir(self, path):
        # Create the target directory if it does not exist yet
        if not os.path.exists(path):
            os.makedirs(path)

    def write(self, contents):
        return self.f.write(contents)

    def close(self):
        self.f.close()


spider.py

# coding:utf-8
__author__ = 'zhengjinwei'
import re
import requests
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib


class Spider:
    def __init__(self, userAgent):
        self.user_agent = userAgent
        self.headers = {
            'User-Agent': self.user_agent
        }

    def getHttp(self, url, param=None):
        return requests.get(url, params=param, headers=self.headers)

    def postHttp(self, url, postData=None):
        return requests.post(url, data=postData, headers=self.headers)

    def sessionPostHttp(self, url, param=None):
        # POST using the cookies saved by the login step
        session = requests.session()
        session.cookies = cookielib.LWPCookieJar(filename='cookies')
        try:
            session.cookies.load(ignore_discard=True)
            return session.post(url, data=param, headers=self.headers)
        except:
            print("Cookies could not be loaded")
            return None

    def sessionGetHttp(self, url, param=None):
        # GET using the cookies saved by the login step
        session = requests.session()
        session.cookies = cookielib.LWPCookieJar(filename='cookies')
        try:
            session.cookies.load(ignore_discard=True)
            return session.get(url, headers=self.headers, allow_redirects=False)
        except:
            print("Cookies could not be loaded")
            return None

    def parseReg(self, content, strPattern, count):
        # Run the regex and keep the first `count` capture groups of every match
        pattern = re.compile(strPattern, re.S)
        items = re.findall(pattern, content)
        contents = []
        for item in items:
            contents.append([item[i] for i in range(count)])
        return contents

    def getContents(self, url, strPattern, count, method="get", param=None):
        # Fetch a page and run parseReg over it
        if method == "get":
            page = self.getHttp(url, param).text
        else:
            page = self.postHttp(url, param).text
        return self.parseReg(page, strPattern, count)

# demo = Spider("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
# t = demo.getHttp("https://www.baidu.com/index.php?tn=02049043_23_pg")
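To make the regex extraction concrete, here is a small usage example of parseReg against a made-up fragment shaped like one fan-list entry (the HTML and the values in it are invented for illustration):

sp = Spider('Mozilla/5.0')
html = '<a class="S_txt1" target="_blank" usercard="id=1" href="/u/1">Alice</a>'
pattern = r'<a class="S_txt1" target="_blank" usercard=".*?" href="(.*?)">(.*?)</a>'
print(sp.parseReg(html, pattern, 2))  # [['/u/1', 'Alice']]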

Result screenshot (not reproduced here)



Notes:

1. Sina Weibo itself restricts the fan list: only the first 5 pages of data can be viewed.

2. For reconnaissance, you can use Firefox with the FireDebug plugin to intercept requests and capture the cookie and other parameters; a minimal sketch combining both notes follows.
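A minimal sketch tying both notes together, assuming a Cookie string has already been captured from the browser. The cookie value below is a placeholder, not a working session, and the 5-page cap is taken from note 1 above, not from any official documentation:

import requests

captured_cookie = 'SUB=...; SUBP=...'  # placeholder: paste the string captured in Firefox here

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Cookie': captured_cookie,
}

base_url = 'http://weibo.com/p/1005053810677006/follow?relate=fans&page='

# Note 1: Weibo only serves the first 5 pages of the fan list, so cap the loop there.
for page in range(1, 6):
    resp = requests.get(base_url + str(page), headers=headers)
    print(page, resp.status_code)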

