The code first; a detailed walkthrough will follow.
Python scraper code:
# -*- coding:utf-8 -*-
############################################################################
# Program: Shanghai Lianjia scraper
# Purpose: crawl Shanghai Lianjia second-hand housing data, both on-sale and
#          sold listings (roughly 50,000 records each; 20,000+ xiaoqu)
# Created: 2016/11/10
# Change history:
#   2016.11.26
#   2016.11.27: added search-by-subway; updated district parameters; split
#               into modules for standalone use
#   2016.12.06: added multithreading
#   2016.12.28: added fine-grained filters (area, layout, total price) to
#               widen crawl coverage
# Libraries: requests, BeautifulSoup4, MySQLdb
# Author: yuzhucu
#############################################################################
import requests
from bs4 import BeautifulSoup
import time
import MySQLdb
import urllib
import urllib2
import json
import cookielib
import re
import zlib
# Thread/Queue serve the multithreaded part of the script, which is not shown
# in this excerpt
from threading import Thread
from Queue import Queue
from time import sleep
# Log in first: data from the last three months cannot be crawled without
# logging in
# import LianJiaLogIn

# Module-level settings used by getURL below. The full script (truncated in
# this excerpt) presumably defines these; defaults are assumed here.
isproxy = 0     # set to 1 to route requests through the proxy in getURL
max_retry = 50  # total retry budget, used in the progress messages

# Get the current time as a formatted string
def getCurrentTime():
    return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))

# Enumerate filter combinations (area x layout x total price) for on-sale
# listing pages
def getFangCondition():
    result = []
    for a in range(1, 9):          # area bucket
        for l in range(1, 7):      # layout (number of rooms)
            for p in range(1, 9):  # total-price bucket
                cond = {}
                cond['url'] = 'a' + str(a) + 'l' + str(l) + 'p' + str(p)
                cond['a'] = 'a' + str(a)
                cond['l'] = 'l' + str(l)
                cond['p'] = 'p' + str(p)
                # print cond['url']
                result.append(cond)
    return result

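# For illustration: getFangCondition() yields 8 * 6 * 8 = 384 filter suffixes
# ('a1l1p1', 'a1l1p2', ..., 'a8l6p8'); per the change history above, crawling
# each filtered view separately widens the total crawl coverage.
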
# Same idea for sold (transaction) listings, filtered by area and layout only
def getFangTransCondition():
    result = []
    for a in range(1, 9):      # area bucket
        for l in range(1, 7):  # layout (number of rooms)
            cond = {}
            cond['url'] = 'a' + str(a) + 'l' + str(l)
            cond['a'] = 'a' + str(a)
            cond['l'] = 'l' + str(l)
            # print cond['url']
            result.append(cond)
    return result

# Fetch a URL with retries: each failure sleeps 10 s longer, raises the
# timeout by 10 s, and recurses with one fewer retry
def getURL(url, tries_num=50, sleep_time=0, time_out=10):
    headers = {'content-type': 'application/json',
               'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
    proxies = {"http": "10.11.12.13:8080", "https": "http://10.11.12.13:8080"}  # replace with a working proxy IP
    sleep_time_p = sleep_time
    time_out_p = time_out
    tries_num_p = tries_num
    res = None
    try:
        if isproxy == 1:
            res = requests.get(url, headers=headers, timeout=time_out, proxies=proxies)
        else:
            res = requests.get(url, headers=headers, timeout=time_out)
        res.raise_for_status()  # raise on any non-200 status code
    except requests.RequestException as e:
        sleep_time_p = sleep_time_p + 10
        time_out_p = time_out_p + 10
        tries_num_p = tries_num_p - 1
        # Retry within the budget, with the longer sleep and timeout
        if tries_num_p > 0:
            time.sleep(sleep_time_p)
            print getCurrentTime(), url, 'URL Connection Error: retry', max_retry - tries_num_p, ':', e
            res = getURL(url, tries_num_p, sleep_time_p, time_out_p)
            if res is not None and res.status_code == 200:
                print getCurrentTime(), url, 'URL Connection Success: attempts', max_retry - tries_num_p, ', sleep_time:', sleep_time_p, ', time_out:', time_out_p
            else:
                print getCurrentTime(), url, 'URL Connection Error: attempts', max_retry - tries_num_p, ', sleep_time:', sleep_time_p, ', time_out:', time_out_p
    return res

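# Illustrative call (an assumption; it simply mirrors how the parsers below
# use getURL):
#   res = getURL('http://sh.lianjia.com/xiaoqu/')
#   if res is not None and res.status_code == 200:
#       soup = BeautifulSoup(res.text, 'html.parser')
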
# Crawl a xiaoqu (residential-complex) list page: parse every .info-panel
# card, save it via the mySQL helper (defined in the part of the script not
# shown here), then follow the link to that complex's on-sale listings
def getXiaoquList(fang_url):
    result = {}
    base_url = 'http://sh.lianjia.com'
    res = getURL(fang_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for fang in soup.select('.info-panel'):
        if len(fang) > 0:
            try:
                result['xiaoqu_key'] = fang.select('h2')[0].a['key'].strip()
                result['xiaoqu_name'] = fang.select('h2')[0].text.strip()
                result['xiaoqu_url'] = base_url + fang.select('h2')[0].a['href'].strip()
                result['quyu'] = fang.select('.con')[0].contents[1].text.strip()     # district
                result['bankuai'] = fang.select('.con')[0].contents[3].text.strip()  # sub-district
                result['price'] = fang.select('.price')[0].span.text.strip() + fang.select('.price')[0].contents[2].strip()
                result['age'] = ''
                result['subway'] = ''
                result['onsale_num'] = ''
                result['fang_url'] = ''
                if len(fang.select('.con')[0].contents) >= 5:
                    result['age'] = fang.select('.con')[0].contents[-1].string.strip()
                if len(fang.select('.fang-subway-ex')) > 0:
                    result['subway'] = fang.select('.fang-subway-ex')[0].text.strip()
                if len(fang.select('.square')) > 0:
                    result['onsale_num'] = fang.select('.square')[0].a.text.strip()
                    result['fang_url'] = base_url + fang.select('.square')[0].a['href'].strip()
                result['updated_date'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                mySQL.insertData('lianjia_fang_xiaoqu', result)
                print getCurrentTime(), u'xiaoqu:', result['xiaoqu_key'], result['xiaoqu_name'], result['age'], result['quyu'], result['bankuai'], \
                    result['subway'], result['xiaoqu_url'], result['price'], result['onsale_num'], result['fang_url']
                # Follow this complex's on-sale listings (the original called
                # getLianjiaList twice per card; once is enough)
                if result['fang_url']:
                    getLianjiaList(result['fang_url'])
            except Exception, e:
                print getCurrentTime(), u'Exception:', e  # don't assume e.args has exactly two elements
    return result

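The mySQL object used above is created in the part of the script this excerpt
cuts off. As a rough sketch of the contract it must satisfy (an assumption,
not the author's actual class), insertData takes a table name and a dict and
writes one row:

# Sketch of the assumed DB helper; the real one is not shown in this excerpt
class MySQLHelper(object):
    def __init__(self, host, user, passwd, db):
        self.conn = MySQLdb.connect(host=host, user=user, passwd=passwd,
                                    db=db, charset='utf8')
    def insertData(self, table, row):
        # Build the statement from the dict's keys; REPLACE keeps re-crawls
        # idempotent on the primary key
        cur = self.conn.cursor()
        cols = ', '.join(row.keys())
        marks = ', '.join(['%s'] * len(row))
        cur.execute('REPLACE INTO %s (%s) VALUES (%s)' % (table, cols, marks),
                    row.values())
        self.conn.commit()
        cur.close()

# mySQL = MySQLHelper('127.0.0.1', 'user', 'password', 'lianjia')  # example wiring
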
# Crawl an on-sale listings page (the source is truncated partway through
# this function)
def getLianjiaList(fang_url):
    result = {}
    base_url = 'http://sh.lianjia.com'
    res = getURL(fang_url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for fang in soup.select('.info-panel'):
        if len(fang) > 0:
            result['fang_key'] = fang.select('h2')[0].a['key'].strip()
            result['fang_desc'] = fang.select('h2')[0].text.strip()
            result['fang_url'] = base_url + fang.select('h2')[0].a['href'].strip()
            result['price'] = fang.select('.price')[0].text.strip()
            result['price_pre'] = fang.select('.price-pre')[0].text.strip()
            result['xiaoqu  # <-- the original listing is cut off here; the rest of the function is not recoverable
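
The excerpt ends before the driver code. Here is a minimal sketch of how the
pieces above could be wired together; this is my assumption, not the author's
driver, and the 'd{page}' paging suffix is inferred from 2016-era
sh.lianjia.com URLs and may need adjusting:

# Minimal driver sketch (assumption -- the original driver is not shown)
if __name__ == '__main__':
    base_url = 'http://sh.lianjia.com/ershoufang/'
    for cond in getFangCondition():
        for page in range(1, 3):  # first couple of pages per filter, for illustration
            list_url = base_url + 'd' + str(page) + cond['url']
            print getCurrentTime(), 'crawling', list_url
            # getLianjiaList(list_url)  # enable once the mySQL helper is wired up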