Scraping second-hand housing listings for every city on Fang.com with Python
import csv

# Minimal csv.writer example: write a header row, then several data rows.
with open("11.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["a", "b", "c"])
    writer.writerows([[1, 1, 1], [2, 2, 2], [3, 3, 3]])
Task: scrape the second-hand housing listings of the major cities on Fang.com (www.fang.com).
All cities must be covered and the results saved to CSV. You may put all the data in a single file, as long as each record notes which province and city it belongs to, or you may write one CSV file per city. For every listing, capture at least the title, area, price, and address.
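For the single-file layout, each row simply carries the province and city in front of the listing fields. A minimal sketch (the file name esf_all.csv and the variable names are assumptions for illustration):

import csv

# One shared CSV: every row records which province/city the listing belongs to.
with open('esf_all.csv', 'w', newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(['省份', '城市', '标题', '面积', '价格', '地址'])
    # each scraped listing is then appended as one row, for example:
    # writer.writerow([province, city, title, size, price, addr])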
Fetch the URL and parse the response
import requests

# Download a page and return its text, letting requests guess the encoding.
def response(url, headers):
    html = requests.get(url=url, headers=headers)
    html.encoding = html.apparent_encoding
    return html.text
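For example, the city index page (the same URL used by crawlCity in the full script below) can be fetched with this helper; a small usage sketch:

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

# Fetch the page that lists every city and its link; it is parsed in the next step.
city_index = response('https://gz.esf.fang.com/newsecond/esfcities.aspx', headers)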
Parse that page to extract every city name and its link into lists, then crawl each city. The complete script:
# -*- coding:utf-8 -*-
import requests
import re
import csv
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}


def response(url, headers):
    # Download a page and return its text, letting requests guess the encoding.
    html = requests.get(url=url, headers=headers)
    html.encoding = html.apparent_encoding
    return html.text


def crawl(url, write, headers):
    # Scrape one listings page and write every listing as a CSV row.
    html = response(url, headers)
    soup = BeautifulSoup(html, 'lxml')
    titles = []        # listing titles
    house_types = []   # floor plans
    sizes = []         # floor areas
    floors = []        # floors
    orientations = []  # orientations
    addrs = []         # addresses
    totals = []        # total prices
    prices = []        # unit prices
    items1 = soup.find_all('span', class_="tit_shop")
    for item in items1:
        titles.append(item.string.split()[0])
    items2 = soup.find_all('p', class_="tel_shop")
    for item in items2:
        house_types.append(item.contents[0].split()[0])
        sizes.append(item.contents[2].split()[0])
        floors.append(item.contents[4].split()[0])
        orientations.append(item.contents[6].split()[0])
    items4 = soup.find_all('p', class_="add_shop")
    for item in items4:
        addrs.append(item.contents[3].string)
    items5 = soup.find_all('dd', class_="price_right")
    for item in items5:
        totals.append(item.contents[1].contents[1].string)
        prices.append(item.contents[3].string)
    for i in range(len(titles)):
        write.writerow([titles[i], house_types[i], sizes[i], floors[i],
                        orientations[i], addrs[i], totals[i], prices[i]])


def crawlCity(url2, headers, address_list, hrefs):
    # Collect every city name and its link from the city index page.
    html2 = response(url2, headers)
    soup = BeautifulSoup(html2, 'lxml')
    items = soup.find_all('a', class_="red")
    for item in items:
        address_list.append(item.string)
        hrefs.append(item['href'])


def crawlPage(url, headers):
    # Return the total number of pages for a city, or 0 if there are no listings.
    html = response(url, headers)
    items = re.findall("共(.*)页", html)
    if len(items) == 0:
        return 0
    return items[0]


def main():
    address_list = []
    hrefs = []
    url2 = 'https://gz.esf.fang.com/newsecond/esfcities.aspx'
    crawlCity(url2, headers, address_list, hrefs)
    key = ['标题', '户型', '面积', '楼层', '朝向', '地址', '总价/万', '单位价格']
    for i in range(len(address_list)):
        # One CSV file per city, named after the city.
        with open('{}.csv'.format(address_list[i]), 'a', newline='', encoding='utf-8') as fp:
            write = csv.writer(fp)
            write.writerow(key)
            print('现在爬取%s的二手房信息' % address_list[i])
            pageurl = "http:" + hrefs[i]
            totalpage = crawlPage(pageurl, headers)
            if totalpage == 0:
                print("该城市无房源信息\n")
                continue
            totalpage = int(totalpage)
            for page in range(1, totalpage + 1):
                pages = str(page + 30)
                new_url = "http:" + hrefs[i] + "/?i=" + pages
                crawl(new_url, write, headers)
                print('第%s页爬取完成' % page)
            print('已完成%s爬取' % address_list[i])
            print('\n')


if __name__ == '__main__':
    main()
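The script above writes one CSV per city, with the file named after the city (the second option in the requirement). If you prefer the single-file variant instead, the per-city files can be merged afterwards; a minimal sketch, assuming the per-city CSVs produced above sit in the working directory and all share the header written by main():

import csv
import glob
import os

# Merge every per-city CSV into one file, prefixing each row with the city name
# (taken from the file name), so all listings live in a single file.
with open('all_cities.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    writer.writerow(['城市', '标题', '户型', '面积', '楼层', '朝向', '地址', '总价/万', '单位价格'])
    for path in glob.glob('*.csv'):
        if path == 'all_cities.csv':
            continue
        city = os.path.splitext(os.path.basename(path))[0]
        with open(path, newline='', encoding='utf-8') as fp:
            reader = csv.reader(fp)
            next(reader, None)  # skip the per-city header row
            for row in reader:
                writer.writerow([city] + row)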
This is my original work; feel free to repost it.