
Scraping rental housing data with Python

 

Target URL: https://sh.lianjia.com/zufang/

The code is as follows:

import requests
# framework for parsing HTML
from bs4 import BeautifulSoup
# framework for writing Excel files
from xlwt import *
import json

# create a workbook
book = Workbook(encoding='utf-8')
# add a sheet named 'sheet1' to the workbook; allow cells to be overwritten
sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)
# set the cell style
style = XFStyle()
pattern = Pattern()
pattern.pattern = Pattern.SOLID_PATTERN
pattern.pattern_fore_colour = 0x00
style.pattern = pattern
# write the column headers: title, address, price, building year, ownership years, metro
sheet.write(0, 0, "标题")
sheet.write(0, 1, "地址")
sheet.write(0, 2, "价格")
sheet.write(0, 3, "建筑年代")
sheet.write(0, 4, "满年限")
sheet.write(0, 5, "离地铁")
# set the column widths
sheet.col(0).width = 0x0d00 + 200 * 50
sheet.col(1).width = 0x0d00 + 20 * 50
sheet.col(2).width = 0x0d00 + 10 * 50
sheet.col(3).width = 0x0d00 + 120 * 50
sheet.col(4).width = 0x0d00 + 1 * 50
sheet.col(5).width = 0x0d00 + 50 * 50

# the Shanghai districts the crawler walks through
citys = ['pudong', 'minhang', 'baoshan', 'xuhui', 'putuo', 'yangpu', 'changning', 'songjiang',
         'jiading', 'huangpu', 'jinan', 'zhabei', 'hongkou', 'qingpu', 'fengxian', 'jinshan', 'chongming',
         'shanghaizhoubian']


def getHtml(city):
    url = 'http://sh.lianjia.com/ershoufang/%s/' % city
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    request = requests.get(url=url, headers=headers)
    # request.content is preferred over request.text here because it sidesteps encoding guesses
    respons = request.content
    # parse the response with bs4; 'html.parser' is the built-in parser, lxml would also work
    soup = BeautifulSoup(respons, 'html.parser')
    # select the pagination div (class page-box); its first child carries a 'page-data' attribute
    pageDiv = soup.select('div .page-box')[0]
    pageData = dict(pageDiv.contents[0].attrs)['page-data']
    pageDataObj = json.loads(pageData)
    totalPage = pageDataObj['totalPage']
    curPage = pageDataObj['curPage']
    print(pageData)
    # walk every page up to the total page count
    for i in range(totalPage):
        pageIndex = i + 1
        print(city + "=========================================第 " + str(pageIndex) + " 页")
        print("\n")
        saveData(city, url, pageIndex)


# parse one page of listings and write each row into the sheet
def saveData(city, url, pageIndex):
    global row
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    urlStr = '%spg%s' % (url, pageIndex)
    print(urlStr)
    html = requests.get(urlStr, headers=headers).content
    soup = BeautifulSoup(html, 'lxml')
    liList = soup.findAll("li", {"class": "clear LOGCLICKDATA"})
    print(len(liList))
    index = 0
    for info in liList:
        title = info.find("div", class_="title").find("a").text
        address = info.find("div", class_="address").find("a").text
        flood = info.find("div", class_="flood").text
        subway = info.find("div", class_="tag").findAll("span", {"class": "subway"})
        subway_col = ""
        if len(subway) > 0:
            subway_col = subway[0].text
        taxfree = info.find("div", class_="tag").findAll("span", {"class": "taxfree"})
        taxfree_col = ""
        if len(taxfree) > 0:
            taxfree_col = taxfree[0].text
        priceInfo = info.find("div", class_="priceInfo").find("div", class_="totalPrice").text
        print(flood)
        sheet.write(row, 0, title)
        sheet.write(row, 1, address)
        sheet.write(row, 2, priceInfo)
        sheet.write(row, 3, flood)
        sheet.write(row, 4, taxfree_col)
        sheet.write(row, 5, subway_col)
        row += 1
        index = row


# only run when this file is executed directly, not when it is imported by another script
if __name__ == '__main__':
    # getHtml('jinshan')
    row = 1
    for i in citys:
        getHtml(i)
    # save the workbook; with no path given the file lands in the current directory
    book.save('lianjia-shanghai.xls')


The approach:

  • First scrape the URL and name of each district, join each with the main URL into a complete URL, then loop over the URL list and scrape every district's rental listings in turn.
  • When scraping a district, find the largest page number, then walk through the pages and scrape the listings on each page in turn.

Before posting the code, a quick word about the Python packages this crawler uses:

  • requests: sends the HTTP requests to the Lianjia site.
  • lxml: parses the page; XPath expressions combined with regular expressions extract the data, and it is faster than bs4 (see the small standalone example after this list).
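
As a quick, self-contained illustration of that requests + lxml + re pattern, here is a minimal sketch that parses a hard-coded HTML fragment instead of a live page; the fragment, the XPath paths and the regex are invented for the example and are not Lianjia's real markup:

import re
from lxml import etree

# a made-up HTML fragment standing in for a downloaded listing page
html = """
<div class="where">
  <a><span>整租 · 某小区 两居室</span></a>
  <span class="meters">52.3平米</span>
</div>
"""

tree = etree.HTML(html)
# XPath pulls the raw text nodes out of the parsed tree
title = tree.xpath("//div[@class='where']/a/span/text()")[0]
meters = tree.xpath("//span[@class='meters']/text()")[0]
# a regular expression then strips the unit, keeping only the number
square = re.findall(r"(\d+\.?\d*)", meters)[0]
print(title, square)  # prints: 整租 · 某小区 两居室 52.3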

The code is as follows:

import requests
import time
import re
from lxml import etree


# collect the link of every district in the city
def get_areas(url):
    print('start grabing areas')
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
    response = requests.get(url, headers=headers)
    content = etree.HTML(response.text)
    areas = content.xpath("//dd[@data-index = '0']//div[@class='option-list']/a/text()")
    areas_link = content.xpath("//dd[@data-index = '0']//div[@class='option-list']/a/@href")
    # index 0 is the unfiltered entry, so start from 1
    for i in range(1, len(areas)):
        area = areas[i]
        area_link = areas_link[i]
        link = 'https://bj.lianjia.com' + area_link
        print("开始抓取页面")
        get_pages(area, link)


# read how many pages a district has, then build the link of every page
def get_pages(area, area_link):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
    response = requests.get(area_link, headers=headers)
    pages = int(re.findall("page-data=\'{\"totalPage\":(\d+),\"curPage\"", response.text)[0])
    print("这个区域有" + str(pages) + "页")
    for page in range(1, pages + 1):
        # build the page URL from the district link instead of hard-coding a single district
        url = area_link + 'pg' + str(page)
        print("开始抓取" + str(page) + "的信息")
        get_house_info(area, url)


# scrape the detailed rental info of one page in one district
def get_house_info(area, url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}
    time.sleep(2)
    try:
        response = requests.get(url, headers=headers)
        content = etree.HTML(response.text)
        # each list page shows 30 listings
        for i in range(30):
            title = content.xpath("//div[@class='where']/a/span/text()")[i]
            room_type = content.xpath("//div[@class='where']/span[1]/span/text()")[i]
            square = re.findall("(\d+)", content.xpath("//div[@class='where']/span[2]/text()")[i])[0]
            position = content.xpath("//div[@class='where']/span[3]/text()")[i].replace(" ", "")
            try:
                detail_place = re.findall("([\u4E00-\u9FA5]+)租房",
                                          content.xpath("//div[@class='other']/div/a/text()")[i])[0]
            except Exception:
                detail_place = ""
            floor = re.findall("([\u4E00-\u9FA5]+)\(", content.xpath("//div[@class='other']/div/text()[1]")[i])[0]
            total_floor = re.findall("(\d+)", content.xpath("//div[@class='other']/div/text()[1]")[i])[0]
            try:
                house_year = re.findall("(\d+)", content.xpath("//div[@class='other']/div/text()[2]")[i])[0]
            except Exception:
                house_year = ""
            price = content.xpath("//div[@class='col-3']/div/span/text()")[i]
            # append one comma-separated row per listing
            with open('链家北京租房.txt', 'a', encoding='utf-8') as f:
                f.write(area + ',' + title + ',' + room_type + ',' + square + ',' + position +
                        ',' + detail_place + ',' + floor + ',' + total_floor + ',' + price + ',' + house_year + '\n')
        print('writing work has done!continue the next page')
    except Exception:
        print('ooops! connecting error, retrying.....')
        time.sleep(20)
        return get_house_info(area, url)


def main():
    print('start!')
    url = 'https://bj.lianjia.com/zufang'
    get_areas(url)


if __name__ == '__main__':
    main()

 

Since floor plans differ a lot between housing compounds and their locations are scattered, each compound still needs to be analysed on its own.
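
The chart code below operates on a pandas DataFrame named df, which the original text never shows being built. Here is a minimal sketch, assuming the 链家北京租房.txt file written by the crawler above and the pyecharts 0.5.x API that the chart calls use; the column names simply follow the variable names in get_house_info and are my own labels, not something given in the source:

import pandas as pd
from pyecharts import Line, Bar, Pie, Overlap

# one comma-separated line per listing, in the order get_house_info writes them
columns = ['area', 'title', 'room_type', 'square', 'position',
           'detail_place', 'floor', 'total_floor', 'price', 'house_year']
df = pd.read_csv('链家北京租房.txt', names=columns, encoding='utf-8')
# square and price were written as text; the charts below need them as numbers
df['square'] = pd.to_numeric(df['square'], errors='coerce')
df['price'] = pd.to_numeric(df['price'], errors='coerce')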

Code:

# 北京路段_房屋均价分布图: average rent and listing count per main Beijing street
# df, pd and the pyecharts classes come from the loading sketch above
detail_place = df.groupby(['detail_place'])
house_com = detail_place['price'].agg(['mean', 'count'])
house_com.reset_index(inplace=True)
detail_place_main = house_com.sort_values('count', ascending=False)[0:20]
attr = detail_place_main['detail_place']
v1 = detail_place_main['count']
v2 = detail_place_main['mean']
line = Line("北京主要路段房租均价")
line.add("路段", attr, v2, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
         mark_point=['min', 'max'], xaxis_interval=0, line_color='lightblue',
         line_width=4, mark_point_textcolor='black', mark_point_color='lightblue',
         is_splitline_show=False)
bar = Bar("北京主要路段房屋数量")
bar.add("路段", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
        xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.add(line, yaxis_index=1, is_add_yaxis=True)
overlap.render('北京路段_房屋均价分布图.html')

Floor area & rent show a stepped, tiered distribution.

# 房源价格区间分布图: number of listings per price range
price_info = df[['area', 'price']]
# bucket the prices; the labels match the bin edges
bins = [0, 1000, 1500, 2000, 2500, 3000, 4000, 5000, 6000, 8000, 10000]
level = ['0-1000', '1000-1500', '1500-2000', '2000-2500', '2500-3000',
         '3000-4000', '4000-5000', '5000-6000', '6000-8000', '8000-10000']
price_stage = pd.cut(price_info['price'], bins=bins, labels=level).value_counts().sort_index()
attr = price_stage.index
v1 = price_stage.values
bar = Bar("价格区间&房源数量分布")
bar.add("", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
        xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.render('价格区间&房源数量分布.html')

# 房屋面积分布: share of listings per floor-area bucket
bins = [0, 30, 60, 90, 120, 150, 200, 300, 400, 700]
level = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200', '200-300', '300-400', '400+']
df['square_level'] = pd.cut(df['square'], bins=bins, labels=level)
df_digit = df[['area', 'room_type', 'square', 'position', 'total_floor',
               'floor', 'house_year', 'price', 'square_level']]
s = df_digit['square_level'].value_counts()
attr = s.index
v1 = s.values
pie = Pie("房屋面积分布", title_pos='center')
pie.add(
    "",
    attr,
    v1,
    radius=[40, 75],
    label_text_color=None,
    is_label_show=True,
    legend_orient="vertical",
    legend_pos="left",
)
overlap = Overlap()
overlap.add(pie)
overlap.render('房屋面积分布.html')

# 房屋面积&价位分布: average rent per floor-area bucket
square = df_digit[['square_level', 'price']]
prices = square.groupby('square_level').mean().reset_index()
amount = square.groupby('square_level').count().reset_index()
attr = prices['square_level']
v1 = prices['price']
bar = Bar("房屋面积&价位分布")
bar.add("", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=4.2,
        xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.render('房屋面积&价位分布.html')

Excerpted from: 爬取了上万条租房数据,你还要不要北漂 ("Scraped tens of thousands of rental listings: do you still want to drift to Beijing?")

 
