I have recently been learning web scraping and, finding parts of it a bit confusing, decided to consolidate what I have learned by building a simple crawler of my own. The crawler is straightforward: it scrapes second-hand housing listings from Lianjia Guangzhou, organizes the relevant fields, and uses the pandas library to write them into an Excel spreadsheet. Without further ado, let's look at the code.
import re

import requests
from bs4 import BeautifulSoup
import pandas

# Lianjia pages by incrementing the "pg" number at the end of the URL,
# so we can reach any listing page just by formatting the page number in.
base_url = "https://gz.lianjia.com/ershoufang/pg{}/"

# Request headers copied from Chrome's developer tools.
# Note: the Cookie below comes from the author's own browser session and will
# expire; replace it with your own (or drop it) before running the script.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    # 'br' (brotli) is omitted so requests can always decode the response body.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'select_city=110000; all-lj=dafad6dd721afb903f2a315ab2f72633; lianjia_uuid=20e04e6a-43b9-4aff-92dd-da1a6ee8ac5d; TY_SESSION_ID=5c0be168-287f-4a70-81ef-27814df5724b; _smt_uid=5ca9b45b.c2b1829; sajssdk_2015_cross_new_user=1; _ga=GA1.2.1478954141.1554625630; _gid=GA1.2.1048010799.1554625630; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1554625628,1554626037; lianjia_ssid=4318f646-8d65-40f1-8079-6daa47bdd643; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22169f6e8889a2f9-09a6f2c0908db2-e323069-1327104-169f6e8889b7d4%22%2C%22%24device_id%22%3A%22169f6e8889a2f9-09a6f2c0908db2-e323069-1327104-169f6e8889b7d4%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1554626134',
    # Host must match the site we actually request (Guangzhou, not Beijing).
    'Host': 'gz.lianjia.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}


# Given the number of pages to crawl, yield one listing-page URL per page.
def generate_url(page):
    for page_number in range(1, page + 1):
        yield base_url.format(page_number)


# Fetch one listing page and use a regular expression to extract the URL of
# every individual property's detail page (the pages behind each title link).
def getallurl(page_url):
    html = requests.get(page_url, headers=headers)
    if html.status_code == 200:
        re_set = re.compile('<a.*?class="noresultRecommend.*?img.*?".*?href="(.*?)"')
        re_get = re.findall(re_set, html.text)
        return re_get


# Parse one detail page with BeautifulSoup, convert the numeric fields,
# and append each piece of information to its list.
def open_url(detail_url, info_index, info_title, info_price, info_unit):
    res = requests.get(detail_url, headers=headers)
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'lxml')
        info_index.append(int(float(soup.select('.total')[0].text)))
        info_title.append(soup.select('.main')[0].text)
        info_price.append(soup.select('.total')[0].text + '万')
        info_unit.append(soup.select('.unitPrice')[0].text)
        # Zip the four lists together so they can be returned as a single
        # object and unpacked later.
        info_total = zip(info_index, info_title, info_price, info_unit)
        return info_total


# Unpack the zipped records back into four separate lists.
def unzip_elements(info_elements):
    index, title, price, unit = [*zip(*info_elements)]
    return list(index), list(title), list(price), list(unit)


# Build a DataFrame from the collected data and write it to an Excel file.
def save_to_xlsx(info):
    pdlook = pandas.DataFrame(info)
    pdlook.to_excel('链家.xlsx', sheet_name="链家二手房广州")


if __name__ == '__main__':
    page = int(input("How many pages do you want to crawl? "))
    # Lists that will hold the scraped data.
    info_index = []
    info_title = []
    info_price = []
    info_unit = []
    # Two nested loops: the outer one walks the listing pages, the inner one
    # visits every property detail URL found on that page (about 28 per page).
    for listing_url in generate_url(page):
        print(listing_url)
        detail_urls = getallurl(listing_url)
        print(len(detail_urls))
        for detail_url in detail_urls:
            info_elements = open_url(detail_url, info_index, info_title, info_price, info_unit)
    # Unpack the zipped data into four lists, combine them into a dictionary,
    # and save the result to Excel.
    index_list, title_list, price_list, unit_list = unzip_elements(info_elements)
    data = {'index': index_list, 'title': title_list, 'price': price_list, 'unit': unit_list}
    save_to_xlsx(data)
    print(data)
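As a quick sanity check (this snippet is not part of the original script, and it simply reuses the 链家.xlsx file name written above), you can read the spreadsheet back with pandas to confirm that the four columns were saved as expected. Note that both to_excel and read_excel need an Excel engine such as openpyxl installed.

import pandas

# Read back the file written by save_to_xlsx and inspect the first rows.
df = pandas.read_excel('链家.xlsx')   # the workbook has a single sheet
print(df.shape)                       # (number of listings, number of columns)
print(df[['index', 'title', 'price', 'unit']].head())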