Most Zhihu crawling requires a login, and Selenium sometimes cannot access those pages at all, so I started with a page for the Shandong University topic. The crawl collects each topic's link, its topic category, and the specific topic name.
The page: https://www.zhihu.com/topic/19864829/index
import time
import warnings
from selenium import webdriver

warnings.filterwarnings("ignore")

url = 'https://www.zhihu.com/topic/19864829/index'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)  # crude wait for the page to finish rendering
# print("page fetched")

f = open("zhihu_urls.csv", 'a', encoding='utf-8')
ls = ['链接', '题目', '话题']  # header row: link, title, category
f.write(",".join(ls) + "\n")

# find_element_by_* is Selenium 3 style; Selenium 4 removed these methods
# (see the By-locator sketch below)
for topicmodule in driver.find_element_by_class_name("TopicIndex-contentMain") \
                         .find_elements_by_class_name("TopicIndexModule"):
    # the category heading for this block of topics
    topic = topicmodule.find_element_by_class_name("TopicIndexModule-title").text
    for item in topicmodule.find_elements_by_class_name("TopicIndexModule-item"):
        topic_info = []
        href = item.find_element_by_tag_name("a").get_attribute("href")  # link
        title = item.find_element_by_tag_name("a").text  # specific topic
        topic_info.append(href)
        topic_info.append(title)
        topic_info.append(topic)
        print(topic_info)
        f.write(",".join(topic_info) + "\n")

f.close()
driver.quit()
print("爬取知乎话题成功")  # "Zhihu topics crawled successfully"
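Two caveats about the script above. The find_element_by_* helpers only exist in Selenium 3; Selenium 4 removed them in favor of By locators. And writing rows with ",".join breaks the CSV whenever a topic title itself contains a comma. A minimal sketch of the same crawl with both issues addressed, assuming the page structure and class names are unchanged from the original script:

import csv
import time
import warnings

from selenium import webdriver
from selenium.webdriver.common.by import By

warnings.filterwarnings("ignore")

url = 'https://www.zhihu.com/topic/19864829/index'
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)  # same crude wait as above

# newline='' lets csv.writer control line endings itself
with open("zhihu_urls.csv", 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)  # quotes any field that contains a comma
    writer.writerow(['链接', '题目', '话题'])
    main = driver.find_element(By.CLASS_NAME, "TopicIndex-contentMain")
    for module in main.find_elements(By.CLASS_NAME, "TopicIndexModule"):
        # category heading for this block of topics
        topic = module.find_element(By.CLASS_NAME, "TopicIndexModule-title").text
        for item in module.find_elements(By.CLASS_NAME, "TopicIndexModule-item"):
            link = item.find_element(By.TAG_NAME, "a")
            writer.writerow([link.get_attribute("href"), link.text, topic])

driver.quit()

Opening the file with 'w' instead of 'a' also avoids appending a duplicate header row on every run.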
CSV contents: a header row 链接,题目,话题 (link, title, category), followed by one row per topic.
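Once the crawl finishes, the file is easy to inspect. A quick sketch, assuming the script above ran successfully; pandas is my choice here, not part of the original post:

import pandas as pd

# the crawler wrote the file as UTF-8 with a 链接/题目/话题 header row
df = pd.read_csv("zhihu_urls.csv", encoding='utf-8')
print(df.head())                  # first few rows: link, title, category
print(df['话题'].value_counts())  # number of topics in each category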