Having decided to work in Shenzhen after graduation, I wanted to get a feel for its rental market, so, drawing on scraping write-ups for 链家网 (Lianjia) found online, I started a crawl of my own.
The site also offers filters for rent, floor plan, orientation, and so on; I leave those unrestricted for now.
After browsing the pages and deciding what I needed, I designed the following wide table:
| district1 | district2 | address | rentType | size | toward | zone | price | tag |
|---|---|---|---|---|---|---|---|---|
| district | sub-district | estate / detailed address | rental type | size (㎡) | orientation | layout | price (CNY/month) | tags |
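To make the target concrete, each scraped listing will end up as a Python dict keyed by these columns. A sketch with invented values:

```python
# Hypothetical record matching the wide table above; every value is made up.
row = {'district1': '罗湖区', 'district2': '黄贝岭', 'address': '某某花园',
       'rentType': '整租', 'size': '35.00', 'toward': '南',
       'zone': '1室1厅1卫', 'price': '3000', 'tag': '近地铁'}
```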
import requests                # fetch pages
from bs4 import BeautifulSoup  # parse the HTML
import re                      # regular expressions
import pymysql                 # talk to MySQL
import time                    # rate limiting
District = "luohuqu"
page = 2
RentType = "rt200600000002"
URL = "https://sz.lianjia.com/zufang/{}/pg{}{}/#contentList".format(District, page, RentType)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
,"Connection":"close"}
res = requests.get(URL, headers = headers)
print("URL = " + res.url)
print(res.text)
The printed URL opens normally and the returned HTML matches what the browser shows, so Lianjia can be fetched with nothing more than the URL and a request header, which makes crawling much easier. On to the next step.
requests.get() fetches the page and BeautifulSoup parses the returned HTML; both are wrapped in one helper:
def parseURL(URL, headers):
    time.sleep(10)  ## rate limit, to avoid the anti-crawler checks
    try:  ## so one failed request does not abort the whole run
        res = requests.get(URL, headers = headers)
        if (res.status_code==200):
            html = res.text
            soup = BeautifulSoup(html, 'lxml')
            return soup
        else:
            print("Failed to fetch {}".format(URL))
            return None
    except:
        time.sleep(20)
        print("Failed to fetch {}".format(URL))
        return None
The lxml parser handles both HTML and XML, is fast, and is tolerant of broken markup.
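A quick illustration of that fault tolerance, on a deliberately malformed fragment (made-up markup, not Lianjia's):

```python
from bs4 import BeautifulSoup

broken = "<div><p>unclosed paragraph<div>second block</div>"  # the <p> is never closed
soup = BeautifulSoup(broken, 'lxml')
print(soup.find('p').get_text())  # lxml repairs the tree -> 'unclosed paragraph'
```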
As the attrs above show, the total number of listings lives in this node.
Note that class is a reserved word in Python, so BeautifulSoup spells the keyword argument class_ with a trailing underscore:
totalFind = int(soup.find('span', class_="content__title--hl").get_text())  # the extracted text is a str; cast to int before comparing
get_text() extracts the node's text and returns a str.
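A minimal sketch of both points on a hypothetical fragment shaped like the count node (the number is invented):

```python
from bs4 import BeautifulSoup

snippet = '<span class="content__title--hl"> 1024 </span>'  # made-up count node
soup = BeautifulSoup(snippet, 'lxml')
text = soup.find('span', class_="content__title--hl").get_text()  # ' 1024 ', a str
print(int(text))  # int() tolerates the surrounding spaces -> 1024
```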
The total page count does not appear in any node's text, so I grab the pager node first and then pull the value out of its attributes with a regex:
def TotalPage(soup):
    pg_content = soup.find_all('div', class_="content__pg")[0]  ## grab the pager node
    totalPage = re.search(r'data-totalpage="(\d+)"', str(pg_content)).group(1)  ## the count lives in an attribute, not in the text
    return totalPage
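Tested on a hypothetical pager node (attribute value invented), the regex pulls the count out of the serialized tag:

```python
import re
from bs4 import BeautifulSoup

pager = '<div class="content__pg" data-totalpage="42"></div>'  # made-up pager node
soup = BeautifulSoup(pager, 'lxml')
print(TotalPage(soup))  # -> '42', still a str; cast with int() before comparing
```

bs4 can also read the attribute directly as pg_content['data-totalpage'], which would avoid the regex entirely; the regex route is kept here to match the function above.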
For pages after the first, the page number is spliced into the URL:
def getURL(District, page, RentType):
    URL = "https://sz.lianjia.com/zufang/{}/pg{}{}/#contentList".format(District, page, RentType)
    return URL
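A quick sanity check of the builder, using values from the test snippet above:

```python
print(getURL(District="luohuqu", page=3, RentType="rt200600000002"))
# https://sz.lianjia.com/zufang/luohuqu/pg3rt200600000002/#contentList
```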
Every listing on a page sits in a node of this kind (div.content__list--item), so I first collect all such nodes,
then, following the wide-table design, locate and extract the fields from each one:
def getMessage(house):
    title = house.find('p', class_="content__list--item--title twoline").get_text(strip=True)  ## strip=True trims surrounding whitespace
    rentType = re.split(r'[·]', title)[0]  ## rental type
    message = house.find('p', class_="content__list--item--des").get_text(strip=True)
    message = re.split(r'[-/]', message)
    if (len(message)==7):
        district1 = message[0]  ## district
        district2 = message[1]  ## sub-district
        address = message[2]    ## estate
        size = message[3].replace("㎡", "")  ## size, with the unit stripped
        toward = message[4]     ## orientation
        zone = message[5]       ## layout
        tag = house.find('p', class_="content__list--item--bottom oneline").get_text(strip=True)  ## tags
        price = house.find('span', class_="content__list--item-price").get_text(strip=True).replace("元/月", "")  ## price
        data = {'district1': district1
               ,'district2': district2
               ,'address': address
               ,'rentType': rentType
               ,'size': size
               ,'toward': toward
               ,'zone': zone
               ,'price': price
               ,'tag': tag
               }
        return data
    else:
        return None
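The split on [-/] is the fragile step, which is why only a 7-field result is accepted. Its behaviour on a made-up description string (all field values invented):

```python
import re

message = "罗湖区-黄贝岭-某某花园/35.00㎡/南/1室1厅1卫/中楼层（7层）"  # invented example
print(re.split(r'[-/]', message))
# ['罗湖区', '黄贝岭', '某某花园', '35.00㎡', '南', '1室1厅1卫', '中楼层（7层）'] -- 7 fields
```

If an estate name itself contains a '-' or '/', the field count changes and getMessage returns None, so such listings are silently skipped.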
def createDB(dbName):
    db = pymysql.connect(host='localhost', user='root', password='***', port=3306)  # use your own host/IP and MySQL password
    cursor = db.cursor()
    sql = "CREATE DATABASE {} DEFAULT CHARACTER SET utf8".format(dbName)
    cursor.execute(sql)
    db.close()
def createTable(dbName, tableName):
    db = pymysql.connect(host='localhost', user='root', password='***', port=3306, db=dbName)
    cursor = db.cursor()
    sql = '''create table if not exists {} (
        district1 VARCHAR(255)
        ,district2 VARCHAR(255)
        ,address VARCHAR(255)
        ,renttype VARCHAR(255)
        ,size VARCHAR(255)
        ,toward VARCHAR(255)
        ,zone VARCHAR(255)
        ,price VARCHAR(255)
        ,tag VARCHAR(255)
    );'''.format(tableName)
    cursor.execute(sql)
    db.close()
def InsertToSql(dbName, tableName, data):
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    db = pymysql.connect(host='localhost', user='root', password='***', port=3306, db=dbName)
    cursor = db.cursor()
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=tableName, keys=keys, values=values)
    try:
        if cursor.execute(sql, tuple(data.values())):
            print('Successful')
            db.commit()
    except:
        print('Failed')
        db.rollback()
    db.close()
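To see the parameterized statement without touching a database, the string assembly can be run on its own (table name and dict here are arbitrary):

```python
data = {'district1': '罗湖区', 'price': '3000'}  # trimmed demo dict
keys = ', '.join(data.keys())
values = ', '.join(['%s'] * len(data))
print('INSERT INTO {table}({keys}) VALUES ({values})'.format(table='demo', keys=keys, values=values))
# INSERT INTO demo(district1, price) VALUES (%s, %s)
```

Because keys and the later tuple(data.values()) iterate the same dict, the placeholders always line up with the values, and pymysql handles the quoting.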
With all the pieces in place, a trial run against a single district and rental type (the two lists are repeated in the final script):

Districts = ['luohuqu', 'futianqu', 'nanshanqu', 'yantianqu', 'baoanqu', 'longgangqu', 'longhuaqu', 'guangmingqu', 'pingshanqu', 'dapengxinqu']  # Shenzhen districts, in pinyin
RentTypes = ['rt200600000001', 'rt200600000002']  # rental type codes

dbName = 'test1'                ## database name
tableName = 'test1'             ## table name
createDB(dbName)                ## create the database
createTable(dbName, tableName)  ## create the table
District = Districts[6]         ## district
RentType = RentTypes[1]         ## rental type
URL = "https://sz.lianjia.com/zufang/{}/{}/".format(District, RentType)
soup = parseURL(URL, headers)
if (soup != None):
    totalFind = int(soup.find('span', class_="content__title--hl").get_text())  ## total number of listings
    if (totalFind != 0):
        totalPage = int(TotalPage(soup))
        print("{} {}: {} listings found".format(District, RentType, totalFind))
        houseElements = soup.find_all('div', class_="content__list--item")
        i = 1
        for house in houseElements:
            data = getMessage(house)
            if (data != None):
                InsertToSql(dbName, tableName, data)
                print("Wrote listing {}".format(i))
                i += 1
        if (totalPage > 1):
            for page in range(2, totalPage + 1):  ## pages 2 .. totalPage
                URL = getURL(District=District, page=page, RentType=RentType)
                soup = parseURL(URL, headers)
                if (soup != None):
                    houseElements = soup.find_all('div', class_="content__list--item")
                    for house in houseElements:
                        data = getMessage(house)
                        if (data != None):
                            InsertToSql(dbName, tableName, data)
                            print("Wrote listing {}".format(i))
                            i += 1
Finally, the complete script:

# Imports
import requests
from bs4 import BeautifulSoup
import re
import pymysql
import time

# Crawl scope
Districts = ['luohuqu', 'futianqu', 'nanshanqu', 'yantianqu', 'baoanqu', 'longgangqu', 'longhuaqu', 'guangmingqu', 'pingshanqu', 'dapengxinqu']  # Shenzhen districts, in pinyin
RentTypes = ['rt200600000001', 'rt200600000002']  # rental type codes
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
          ,"Connection":"close"}

# Build the URL for pages after the first
def getURL(District, page, RentType):
    URL = "https://sz.lianjia.com/zufang/{}/pg{}{}/#contentList".format(District, page, RentType)
    return URL

# Fetch and parse a URL
def parseURL(URL, headers):
    time.sleep(10)
    try:
        res = requests.get(URL, headers = headers)
        if (res.status_code==200):
            html = res.text
            soup = BeautifulSoup(html, 'lxml')
            return soup
        else:
            print("Failed to fetch {}".format(URL))
            return None
    except:
        time.sleep(20)
        print("Failed to fetch {}".format(URL))
        return None

# Total page count
def TotalPage(soup):
    pg_content = soup.find_all('div', class_="content__pg")[0]
    totalPage = re.search(r'data-totalpage="(\d+)"', str(pg_content)).group(1)
    return totalPage

# Extract the fields of one listing
def getMessage(house):
    title = house.find('p', class_="content__list--item--title twoline").get_text(strip=True)  ## strip=True trims surrounding whitespace
    rentType = re.split(r'[·]', title)[0]  ## rental type
    message = house.find('p', class_="content__list--item--des").get_text(strip=True)
    message = re.split(r'[-/]', message)
    if (len(message)==7):
        district1 = message[0]  ## district
        district2 = message[1]  ## sub-district
        address = message[2]    ## estate
        size = message[3].replace("㎡", "")  ## size
        toward = message[4]     ## orientation
        zone = message[5]       ## layout
        tag = house.find('p', class_="content__list--item--bottom oneline").get_text(strip=True)  ## tags
        price = house.find('span', class_="content__list--item-price").get_text(strip=True).replace("元/月", "")  ## price
        data = {'district1': district1
               ,'district2': district2
               ,'address': address
               ,'rentType': rentType
               ,'size': size
               ,'toward': toward
               ,'zone': zone
               ,'price': price
               ,'tag': tag
               }
        return data
    else:
        return None

# Create the database (no db argument: the database does not exist yet)
def createDB(dbName):
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306)
    cursor = db.cursor()
    sql = "CREATE DATABASE {} DEFAULT CHARACTER SET utf8".format(dbName)
    cursor.execute(sql)
    db.close()

# Create the table
def createTable(dbName, tableName):
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306, db=dbName)
    cursor = db.cursor()
    sql = '''create table if not exists {} (
        district1 VARCHAR(255)
        ,district2 VARCHAR(255)
        ,address VARCHAR(255)
        ,renttype VARCHAR(255)
        ,size VARCHAR(255)
        ,toward VARCHAR(255)
        ,zone VARCHAR(255)
        ,price VARCHAR(255)
        ,tag VARCHAR(255)
    );'''.format(tableName)
    cursor.execute(sql)
    db.close()

# Insert one row
def InsertToSql(dbName, tableName, data):
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    db = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306, db=dbName)
    cursor = db.cursor()
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=tableName, keys=keys, values=values)
    try:
        if cursor.execute(sql, tuple(data.values())):
            db.commit()
    except:
        print('Failed')
        db.rollback()
    db.close()

# Main loop
dbName = 'lianjiawang'   ## database name
tableName = 'zufang_sz1' ## table name
## createDB(dbName)      ## create the database
createTable(dbName, tableName)  ## create the table
# District = Districts[6]  ## single-district test
# RentType = RentTypes[1]  ## single rental-type test
for District in Districts:
    for RentType in RentTypes:
        URL = "https://sz.lianjia.com/zufang/{}/{}/".format(District, RentType)
        soup = parseURL(URL, headers)
        if (soup != None):
            totalFind = int(soup.find('span', class_="content__title--hl").get_text())  ## total number of listings
            if (totalFind != 0):
                totalPage = int(TotalPage(soup))
                # print("{} {}: {} listings found".format(District, RentType, totalFind))
                houseElements = soup.find_all('div', class_="content__list--item")
                i = 1
                for house in houseElements:
                    data = getMessage(house)
                    if (data != None):
                        InsertToSql(dbName, tableName, data)
                        # print("Wrote listing {}".format(i))
                        i += 1
                if (totalPage > 1):
                    for page in range(2, totalPage + 1):  ## pages 2 .. totalPage
                        URL = getURL(District=District, page=page, RentType=RentType)
                        soup = parseURL(URL, headers)
                        if (soup != None):
                            houseElements = soup.find_all('div', class_="content__list--item")
                            for house in houseElements:
                                data = getMessage(house)
                                if (data != None):
                                    InsertToSql(dbName, tableName, data)
                                    # print("Wrote listing {}".format(i))
                                    # i += 1
        print("Shenzhen {}, rental type {}: crawl finished".format(District, RentType))