
[Python Scraper Series] Scraping Shanghai Lianjia Second-Hand Housing Data


The code comes first; a detailed walkthrough will follow in a later post.

Python scraper code:

# -*- coding:utf-8 -*-
############################################################################
# Program: Shanghai Lianjia scraper
# Purpose: scrape Shanghai Lianjia second-hand listings (on sale and sold),
#          roughly 50,000 records each, plus over 20,000 communities (xiaoqu)
# Created: 2016/11/10
# Change history:
# 2016.11.26: first revision
# 2016.11.27: added subway-based search; updated district parameters; split
#             into modules so each part can be called separately
# 2016.12.06: added multithreading
# 2016.12.28: added filters by floor area, layout and total price to widen
#             the crawl coverage
# Libraries: requests, BeautifulSoup4, MySQLdb
# Author: yuzhucu
#############################################################################
import requests
from bs4 import BeautifulSoup
import time
import MySQLdb
import urllib
import urllib2
import json
import cookielib
import re
import zlib
from threading import Thread
from Queue import Queue
from time import sleep

# Log in first; without a login, data from the last three months cannot be scraped
#import LianJiaLogIn

# Return the current time as a formatted string for log output
def getCurrentTime():
    return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

# Build all filter combinations for on-sale listings:
# a = floor area, l = layout, p = total price
def getFangCondition():
    result = []
    for a in range(1, 9):          # floor area
        for l in range(1, 7):      # layout
            for p in range(1, 9):  # total price
                cond = {}
                cond['url'] = 'a' + str(a) + 'l' + str(l) + 'p' + str(p)
                cond['a'] = 'a' + str(a)
                cond['l'] = 'l' + str(l)
                cond['p'] = 'p' + str(p)
                #print cond['url']
                result.append(cond)
    return result

# Build all filter combinations for sold (transaction) listings:
# a = floor area, l = layout
def getFangTransCondition():
    result = []
    for a in range(1, 9):          # floor area
        for l in range(1, 7):      # layout
            cond = {}
            cond['url'] = 'a' + str(a) + 'l' + str(l)
            cond['a'] = 'a' + str(a)
            cond['l'] = 'l' + str(l)
            #print cond['url']
            result.append(cond)
    return result

# Fetch a URL with retries. isproxy and max_retry are globals set in the main
# section of the script (not shown in this excerpt).
def getURL(url, tries_num=50, sleep_time=0, time_out=10):
    headers = {'content-type': 'application/json',
               'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
    proxies = {"http": "10.11.12.13:8080", "https": "http://10.11.12.13:8080"}  # replace with a working proxy IP
    sleep_time_p = sleep_time
    time_out_p = time_out
    tries_num_p = tries_num
    try:
        res = requests.Session()
        if isproxy == 1:
            res = requests.get(url, headers=headers, timeout=time_out, proxies=proxies)
        else:
            res = requests.get(url, headers=headers, timeout=time_out)
        res.raise_for_status()  # raise an exception if the status code is not 200
    except requests.RequestException as e:
        # on failure, sleep longer, extend the timeout and retry until the
        # retry budget is exhausted
        sleep_time_p = sleep_time_p + 10
        time_out_p = time_out_p + 10
        tries_num_p = tries_num_p - 1
        #print tries_num_p
        if tries_num_p > 0:
            time.sleep(sleep_time_p)
            print getCurrentTime(), url, 'URL Connection Error: retry attempt', max_retry - tries_num_p, ':', e
            res = getURL(url, tries_num_p, sleep_time_p, time_out_p)
            if res.status_code == 200:
                print getCurrentTime(), url, 'URL Connection Success: tried', max_retry - tries_num_p, 'times', ',sleep_time:', sleep_time_p, ',time_out:', time_out_p
            else:
                print getCurrentTime(), url, 'URL Connection Error: tried', max_retry - tries_num_p, 'times', ',sleep_time:', sleep_time_p, ',time_out:', time_out_p
                pass
    return res

# Scrape a community (xiaoqu) list page, save each community to MySQL and
# follow its link to crawl the on-sale listings of that community
def getXiaoquList(fang_url):
    result = {}
    base_url = 'http://sh.lianjia.com'
    # res=requests.get(fang_url)
    res = getURL(fang_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for fang in soup.select('.info-panel'):
        if (len(fang) > 0):
            try:
                result['xiaoqu_key'] = fang.select('h2')[0].a['key'].strip()
                result['xiaoqu_name'] = fang.select('h2')[0].text.strip()
                result['xiaoqu_url'] = base_url + fang.select('h2')[0].a['href'].strip()
                result['quyu'] = fang.select('.con')[0].contents[1].text.strip()     # district
                result['bankuai'] = fang.select('.con')[0].contents[3].text.strip()  # sub-area
                result['price'] = fang.select('.price')[0].span.text.strip() + fang.select('.price')[0].contents[2].strip()
                result['age'] = ''
                result['subway'] = ''
                result['onsale_num'] = ''
                result['fang_url'] = ''
                if len(fang.select('.con')[0].contents) >= 5:
                    result['age'] = fang.select('.con')[0].contents[-1].string.strip()
                if len(fang.select('.fang-subway-ex')) > 0:
                    result['subway'] = fang.select('.fang-subway-ex')[0].text.strip()
                if len(fang.select('.square')) > 0:
                    result['onsale_num'] = fang.select('.square')[0].a.text.strip()
                    result['fang_url'] = base_url + fang.select('.square')[0].a['href'].strip()
                    getLianjiaList(result['fang_url'])  # crawl this community's on-sale listings
                result['updated_date'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                mySQL.insertData('lianjia_fang_xiaoqu', result)  # mySQL is a DB helper created in the main section (not shown)
                print getCurrentTime(), u'xiaoqu:', result['xiaoqu_key'], result['xiaoqu_name'], result['age'], result['quyu'], result['bankuai'], \
                    result['subway'], result['xiaoqu_url'], result['price'], result['onsale_num'], result['fang_url']
            except Exception, e:
                print getCurrentTime(), u"Exception:%d: %s" % (e.args[0], e.args[1])
    return result

# Scrape the on-sale listings of a single community
def getLianjiaList(fang_url):
    result = {}
    base_url = 'http://sh.lianjia.com'
    # res=requests.get(fang_url)
    res = getURL(fang_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for fang in soup.select('.info-panel'):
        if (len(fang) > 0):
            result['fang_key'] = fang.select('h2')[0].a['key'].strip()
            result['fang_desc'] = fang.select('h2')[0].text.strip()
            result['fang_url'] = base_url + fang.select('h2')[0].a['href'].strip()
            result['price'] = fang.select('.price')[0].text.strip()
            result['price_pre'] = fang.select('.price-pre')[0].text.strip()
            # The source article is cut off at this point; the remaining fields
            # and the rest of the script (MySQL helper, threading, main) are not shown.
            result['xiaoqu
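The functions above rely on a mySQL helper object and on the globals isproxy and max_retry, whose definitions fall in the part of the script that is not shown here. Below is a minimal sketch, under stated assumptions, of what an insertData(table, record) helper could look like using MySQLdb, which the script already imports; the class name, connection parameters and error handling are illustrative, not the author's actual code.

import MySQLdb

# Hypothetical DB helper matching the mySQL.insertData(...) calls above.
# Connection parameters and the target table layout are assumptions.
class LianjiaMySQL(object):
    def __init__(self, host='localhost', user='root', passwd='', db='lianjia'):
        self.conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset='utf8')

    def insertData(self, table, record):
        # Build the INSERT statement from the dict keys and let MySQLdb
        # escape the values through query parameters.
        cols = ','.join(record.keys())
        placeholders = ','.join(['%s'] * len(record))
        sql = 'INSERT INTO %s (%s) VALUES (%s)' % (table, cols, placeholders)
        cur = self.conn.cursor()
        try:
            cur.execute(sql, record.values())
            self.conn.commit()
        except MySQLdb.Error, e:
            self.conn.rollback()
            print getCurrentTime(), u'MySQL Error %d: %s' % (e.args[0], e.args[1])
        finally:
            cur.close()

#mySQL = LianjiaMySQL()  # global object used by getXiaoquList() / getLianjiaList()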
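The change history also mentions multithreading (added 2016.12.06) and the filter combinations produced by getFangCondition() (added 2016.12.28), but that part of the script is likewise not shown. The following is only an illustrative sketch of how a Thread/Queue worker pool could feed those filter combinations into the crawl functions; the URL pattern ('http://sh.lianjia.com/ershoufang/' + cond['url']) and the choice of getLianjiaList as the consumer are assumptions, not the original author's code.

from threading import Thread
from Queue import Queue

# Illustrative worker-pool sketch; URL layout and entry function are assumed.
def crawlWorker(task_queue):
    while True:
        cond = task_queue.get()
        if cond is None:              # poison pill tells this worker to exit
            task_queue.task_done()
            break
        url = 'http://sh.lianjia.com/ershoufang/' + cond['url']  # assumed URL pattern
        try:
            getLianjiaList(url)
        finally:
            task_queue.task_done()

def runCrawler(num_threads=5):
    task_queue = Queue()
    for i in range(num_threads):
        t = Thread(target=crawlWorker, args=(task_queue,))
        t.setDaemon(True)
        t.start()
    for cond in getFangCondition():   # one task per area/layout/price combination
        task_queue.put(cond)
    for i in range(num_threads):
        task_queue.put(None)          # one poison pill per worker
    task_queue.join()                 # wait until every task has been processed

Splitting the crawl by filter combination keeps each worker's requests sequential, so the retry logic inside getURL() still applies per request, while several combinations are crawled in parallel.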