赞
踩
本篇内容:
1.提取知乎可用的请求信息。
2.爬取单个知乎用户的关注者列表。
1.提取知乎可用的请求信息。
问题:爬虫访问知乎网站时,被检测出非人行为,强行跳转到另一个非相关网页。
解决方法:修改header信息,根据实际浏览器信息修改。
这个是我们爬取的第一个用户主页。
左下角可以看到他的关注者有769人,点击关注者就能看到更详细的用户信息。
扬州田鸡等用户的url,可以通过此方法爬取,之后只要重复上述步骤就能完成爬取用户工作。
通过对网页源代码进行分析就可以知道,我们需要的url 分别在这两个网页的这两个标签中。
bs.find('div', class_='NumberBoard FollowshipCard-counts NumberBoard--divider').find_all('a')
bs.find('div',class_='List').find_all('a',class_='UserLink-link')
之后,就按计划完善即可。
图中遇到的问题就是被强制跳转到其他网页了。
你需要一个更加真实的header。
使用chrome的工具对网页真实请求进行分析。
可以看到header的信息,把它作为爬虫的header,就能实现对知乎的访问了。
由于涉及个人信息安全,这里就不提供全部代码了。
有兴趣的同学可以按下面的不完整代码进行修改,共同学习,共同进步。
其中并没有用代理,可以把代理语句删掉,或将这个header换为你的header。
# -*- coding: utf-8 -*-
"""Crawl a Zhihu user's follower list.

Flow: fetch the user's activity page, extract the followers link from the
``NumberBoard`` block, fetch the followers page, then collect each follower's
profile URL from the ``UserLink-link`` anchors.
"""
import sys
sys.path.append('../lib')
from lib_demo import get_proxie
import requests
from bs4 import BeautifulSoup
import random
import time
import socket
import http.client


def get_content(url, data=None):
    """GET *url* using the module-level ``headers``, retrying on network errors.

    Returns the ``requests.Response`` on HTTP 200, otherwise ``None``.
    *data* is accepted for interface compatibility but is unused.

    NOTE(review): ``requests`` normally raises ``requests.exceptions.*``
    rather than raw ``socket``/``http.client`` errors, so these handlers may
    never fire with modern requests versions — confirm against the installed
    version before relying on the retry loop.
    """
    # Randomized timeout so successive requests do not look identical.
    timeout = random.choice(range(80, 180))
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            status = response.status_code
            #response.encoding = "utf-8"
            break
        except socket.timeout as e:
            print("3:", e)
            time.sleep(random.choice(range(8, 15)))
        except socket.error as e:
            print("4:", e)
            time.sleep(random.choice(range(20, 60)))
        except http.client.BadStatusLine as e:
            print("S:", e)
            time.sleep(random.choice(range(30, 80)))
        except http.client.IncompleteRead as e:
            print("6:", e)
            time.sleep(random.choice(range(5, 15)))
    # BUG FIX: the original used ``status is 200`` — identity comparison on an
    # int is CPython-implementation-dependent; use ``==`` for value equality.
    if status == 200:
        return response
    return None


class zhihu:
    """Thin wrapper around Zhihu page fetching (name kept for compatibility)."""

    # BUG FIX: the original misspelled the constructor as ``__int__``, so it
    # was never invoked; ``__init__`` is the real initializer hook.
    def __init__(self):
        self.name = 'zhihu'

    def get_following_url(self, url, proxies, headers):
        """Fetch *url* through *proxies* with *headers* and print the parsed page."""
        r = requests.get(url=url, proxies=proxies, headers=headers, timeout=5)
        # BUG FIX: BeautifulSoup needs the document text, not the Response
        # object itself.
        bs = BeautifulSoup(r.text, 'lxml')
        print(bs)


# ------------------------------- start ------------------------------- #
# BUG FIX: the original bound the instance to the name ``zhihu``, shadowing
# the class and making further instantiation impossible.
spider = zhihu()
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    "Host": "www.zhihu.com",
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Connection': 'keep-alive',
    'cache-control': 'max-age=0',
    'Referer': 'https://www.zhihu.com/people/zyzszbr/followers',
    # NOTE(review): session cookies below are tied to one account and expire;
    # replace them with values captured from your own browser session.
    'cookie': '_zap=f6c4b516-6cac-48ef-83c7-46ba84cbcbfe; d_c0="AIAodopidQ-PTgMCEpSuw3v7yhjEefO8xZo=|1558355606"; q_c1=4b532f6e8c7e4fd98a7ea00f39f81237|1558593988000|1558593988000; __utmv=51854390.000--|3=entry_date=20190523=1; __gads=ID=f740fdd2bde02063:T=1558594499:S=ALNI_MZcneIiTiysGo9r_bFFaZWNn_TWkw; _xsrf=LMIve1ljscXhnQIAGEMxGeYpmiIWgFjg; l_n_c=1; l_cap_id="NGU1NWZmY2ZjMWYyNDUwODg3NzI5N2M4M2JmMGE0YzM=|1559619078|a7d14859f5c06dbe32b50679f7d0a5b53b407e01"; r_cap_id="MWJiMzM3NzNjNzI2NDE3ZThlZDY0NWY2MDI4MWQxNjc=|1559619078|26cc6d6129d1db8995a7de9941534d5ad3855b45"; cap_id="YmJiZWEwMDgxMWI5NDU2OGI2ODViYWU0MzgyMDhhYTg=|1559619078|cc5b6616611cdf61087db69e90ef0e5d08f6f489"; n_c=1; __utma=51854390.1619808308.1558593999.1558658625.1559619089.3; __utmc=51854390; __utmz=51854390.1559619089.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; capsion_ticket="2|1:0|10:1559630389|14:capsion_ticket|44:YmY2ZDRhZTIwY2U2NDkxZGFlZmNhZTAxYWY4OGM4MWY=|e8cfd0ca21146781c5f3dff98686f9b0e4f9be22f8e16cbd687d80498732dc12"; z_c0="2|1:0|10:1559630417|4:z_c0|92:Mi4xTTFfY0FnQUFBQUFBZ0NoMmltSjFEeWNBQUFDRUFsVk5VWjhkWFFDY0Y3Z2NiOVRUQWdWNm5VTkUzOFdtNXl0MEFR|7f0dc5f0250ee08a2935c7febeabc7b9ef2b48e95b89dec5370c369a8c58d4fd"; tgw_l7_route=73af20938a97f63d9b695ad561c4c10c',
    'x-ab-param': 'se_wannasearch=0;top_v_album=1;tp_discussion_feed_type_android=2;se_backsearch=0;top_ydyq=A;se_title_only=0;tp_top_sticky=0;se_timebox_num=3;top_ebook=0;tp_qa_toast=1;se_new_topic=0;soc_bignew=1;se_se_index=1;li_album_liutongab=0;se_agency= 0;top_hotcommerce=1;zr_ebook_chapter=0;ug_fw_answ_aut_1=0;ug_follow_answerer_0=0;se_movietab=1;top_test_4_liguangyi=1;top_universalebook=1;tsp_childbillboard=6;se_km_ad_locate=1;ug_zero_follow_0=0;li_se_ebook_chapter=1;se_websearch=3;top_gr_ab=2;se_webrs=1;se_colorfultab=0;soc_special=0;top_recall_exp_v2=8;qa_web_answerlist_ad=1;soc_bigone=0;soc_update=1;top_quality=0;ug_follow_topic_1=2;se_p_slideshow=0;se_auto_syn=0;tp_meta_card=0;tp_sticky_android=0;tsp_lastread=0;pf_fuceng=1;zr_album_exp=0;zr_km_xgb_model=new_xgb_6_4;se_lottery=0;se_ad_index=10;li_ts_sample=old;se_famous=1;se_payconsult=5;se_site_onebox=0;se_terminate=0;tp_m_intro_re_topic=1;ug_goodcomment_0=1;pf_creator_card=1;zr_infinity_rec_num=close;se_featured=1;se_webmajorob=0;se_webtimebox=1;se_zu_onebox=0;top_root=0;zr_ans_rec=gbrank;zr_art_rec=base;li_hot_score_ab=0;top_reason=1;top_vipconsume=4;ug_zero_follow=0;pf_noti_entry_num=0;se_spb309=0;se_whitelist=1;top_user_cluster=0;se_ri=0;li_tjys_ec_ab=0;top_new_feed=2;pf_foltopic_usernum=50;se_zu_recommend=0;tp_qa_metacard_top=top;ug_goodcomment=0;ls_fmp4=0;li_filter_ttl=2;se_amovietab=0;se_ios_spb309=1;top_recall_deep_user=1;tp_sft=a;tp_time_information=0;ug_newtag=1;li_album3_ab=0;qa_test=0;ug_follow_answerer=0;zr_km_answer=open_cvr;zr_video_rec_weight=open;li_price_test=1;li_se_intervene=1;se_time_threshold=1.5;zr_km_slot_style=event_card;zr_rel_search=base;li_mceb=1;se_search_feed=N;li_lt_tp_score=1;se_rr=1;tp_sft_v2= a;se_page_limit_20=1;top_recall_exp_v1=9;tp_qa_metacard=1;zr_se_footer=1;top_native_answer=6;li_ebook_detail=1;se_subtext=0;se_topicdirect=2;se_preset_tech=0;pf_newguide_vertical=0;se_billboardsearch=0;se_expired_ob=0;se_likebutton=0;li_qa_cover=new;qa_answerlist_ad=0;top_rank=3;tp_header_style=1;pf_feed=1',
    'x-requested-with': 'fetch',
    'x-zse-83': '3_2.0',
    'x-zse-84': 'GLY0k6uqQ0tpbLYqmTH0gHXqH9YxHBx0GTF0Q7UBgh2p28xyM0tqker0SXSYo8tq',
}
url = 'https://www.zhihu.com/people/zyzszbr/activities/'

# ------------------------------- proxy ip ------------------------------- #
getip = get_proxie()
#getip.getip()  # refresh the IP pool
ip = getip.randomip()
proxies = {"http": "http://" + ip, "https": "http://" + ip}  # proxy ip

# Step 1: profile page -> link to the followers page (second NumberBoard link).
r = get_content(url=url)
bs = BeautifulSoup(r.text, 'lxml')
url_list = bs.find('div', class_='NumberBoard FollowshipCard-counts NumberBoard--divider').find_all('a')
chapters_url = []
for one_list in url_list:
    chapters_url.append(one_list.get('href'))
followers_url = 'https://www.zhihu.com' + chapters_url[1]
print(followers_url)

# Step 2: followers page -> each follower's profile URL.
r = get_content(url=followers_url)
bs = BeautifulSoup(r.text, 'lxml')
follower_url = bs.find('div', class_='List').find_all('a', class_='UserLink-link')
print(follower_url)
chapters_url = []
for one_list in follower_url:
    chapters_url.append(one_list.get('href'))
print(chapters_url)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。