Technology has accelerated the pace of the times, and information now spreads faster than ever. Faced with huge volumes of data, data mining is the process of analysing that data and extracting useful knowledge from it, so that people can pull out the data they actually need. Housing, meanwhile, is fundamental to people's livelihoods, and housing prices have become a question everyone cares about; many buyers have shifted their attention to ordinary second-hand homes, yet most still lack a way to filter the available information. This project uses Python with the parsel library to crawl a second-hand housing site, collecting Beijing and Kunming second-hand and rental listings, including the estate name, type, address and asking price. After data cleaning, the K-means clustering algorithm is applied to the Beijing prices among the more than 7,000 records collected, partitioning the listings into groups whose attributes are highly similar.
1. Crawl Lianjia (lianjia.com) and obtain the second-hand housing data
2. Save the data and read it back
3. Clean and preprocess the data
4. Run the cluster analysis
5. Use Flask to build the visualisation
1. Have countermeasures ready for Lianjia's anti-scraping mechanisms
2. Save the data in a single unified format, using a fast and efficient storage method
3. Obtain accurate data with no duplicates and no omissions
4. Group the data objects according to the information discovered in the data about the objects and their relationships, so that the objects within a group are similar to one another (a minimal clustering sketch follows this list)
5. Charts should be varied, clear and concise, and readable at a glance
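Requirement 4 is what the K-means step later in the pipeline implements. The post does not reproduce the clustering code itself, so the following is only a minimal sketch: it assumes scikit-learn is installed and that the preprocessed ershou.csv written by the main program below still stores the unit price as text such as "48,262元/平米". The unit_price column name and the choice of k = 4 are illustrative assumptions, not the original settings.

import pandas as pd
from sklearn.cluster import KMeans

# Load the preprocessed second-hand listings (written by the main program below)
df = pd.read_csv("ershou.csv", encoding="utf-8")

# Pull the numeric part out of strings like "48,262元/平米"; rows that
# fail to parse become NaN and are dropped
df["unit_price"] = (df["每平米售价/月租"].astype(str)
                    .str.replace(",", "")
                    .str.extract(r"(\d+)")[0]
                    .astype(float))
df = df.dropna(subset=["unit_price"])

# Cluster on unit price alone; k=4 is an arbitrary illustrative choice
km = KMeans(n_clusters=4, n_init=10, random_state=0)
df["cluster"] = km.fit_predict(df[["unit_price"]])

# Inspect how the price ranges separate across the clusters
print(df.groupby("cluster")["unit_price"].describe())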
Main program: scraping the site
import requests
import csv
import parsel
import time
import pymysql
import pandas as pd
import numpy as np
import wordcloud
import jieba

# Pool of User-Agent strings to rotate through when Lianjia blocks a request
agent = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.12022 SLBChan/105',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',  # Edge 44.18362.1.0
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',  # IE 11.175.18362.0
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',  # Firefox x64 68.0
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0',  # Firefox x86 68.0
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',  # Chrome x64 75.0.3770.100
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',  # Chrome x86 72.0.3626.121
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',  # Chrome x86 75.0.3770.100
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',  # 2345Explorer 9.9.0.19250
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36',  # 2345Explorer 10.0.0.19291
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36 2345Explorer/10.0.0.19291',  # 2345Explorer 10.0.0.19291
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3708.400 QQBrowser/10.4.3620.400',  # QQBrowser 10.4.2(3620)
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6823.400 QQBrowser/10.3.3117.400',  # QQBrowser 10.3.2(3117)
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3587.400',  # QQBrowser 10.4.2(3587)
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',  # 360 browser 10.0.1920.0
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
]

# Write the CSV header row
with open("链家.csv", mode='a', encoding='utf-8', newline='') as f:
    csv_write = csv.writer(f)
    csv_write.writerow(["类型", "房名", "位置", "简介", "优势", "每平米售价/月租", "总价", "详情页"])

# Scrape Beijing second-hand listings, pages 1-100.
# A response shorter than ~7000 characters means Lianjia served an
# anti-scraping page, so rotate the User-Agent and retry the same page.
page, i = 1, 0
while page <= 100:
    headers = {'User-Agent': agent[i % len(agent)]}
    url = f'https://bj.lianjia.com/ershoufang/pg{page}'
    response = requests.get(url=url, headers=headers)
    # print(response)  # status code
    res_text = response.text
    # print(res_text)  # raw HTML
    if len(res_text) < 7000:
        i += 1
        continue
    # Parse the returned HTML with parsel ===> selector object
    selector = parsel.Selector(res_text)
    lis = selector.css('.sellListContent>li')
    time.sleep(1)
    for li in lis:
        # listing name
        title = li.css('.title>a::text').get()
        # location
        position = '-'.join(li.css('.positionInfo>a::text').getall())
        # short description
        intro = li.css('.houseInfo::text').get()
        # selling points
        tag = '|'.join(li.css('.tag>span::text').getall())
        # price per square metre
        price = li.css('.unitPrice>span::text').get()
        # total price; skip listings that have none
        try:
            to_price = li.css('.totalPrice.totalPrice2>span::text').get() + '万'
        except Exception:
            continue
        # detail-page URL
        href = li.css('.title>a::attr(href)').get()
        with open("链家.csv", mode='a', encoding='utf-8', newline='') as f:
            csv_write = csv.writer(f)
            csv_write.writerow(["二手房", title, position, intro, tag, price, to_price, href])
    print("----- second-hand page %d scraped -----" % page)
    page += 1

# Scrape Beijing rental listings, pages 1-100, with the same retry logic
# (the User-Agent index walks backwards from the end of the pool here)
page, i = 1, -1
while page <= 100:
    headers = {'User-Agent': agent[i % len(agent)]}
    url2 = f'https://bj.lianjia.com/zufang/pg{page}'
    response = requests.get(url=url2, headers=headers)
    res_text = response.text
    if len(res_text) < 7000:
        i -= 1
        continue
    selector = parsel.Selector(res_text)
    divs = selector.css('.content__list>div')
    for div in divs:
        # listing name
        title = div.css('.content__list--item--title>a::text').get()
        # location
        position = '-'.join(div.css('.content__list--item--des>a::text').getall())
        # short description, with layout noise stripped out
        intro = '/'.join(div.css('.content__list--item--des::text').getall())
        intro = intro.replace(" ", "").replace('/-', "").replace("\n", "").replace("//", "")
        # selling points
        tag = '|'.join(div.css('p>i::text').getall()).replace("/|", "")
        # monthly rent
        price = div.css('.content__list--item--main>span>em::text').get() + " 元/月"
        # detail-page URL (relative, so prepend the site root)
        href = div.css('.content__list--item--main>p>a::attr(href)').get()
        href = "https://bj.lianjia.com" + href
        with open("链家.csv", mode='a', encoding='utf-8', newline='') as f:
            csv_write = csv.writer(f)
            csv_write.writerow(["租房", title, position, intro, tag, price, "NULL", href])
    print("----- rental page %d scraped -----" % page)
    page += 1

# Scrape Kunming new-development listings, pages 1-72
page, i = 1, 10
while page <= 72:
    headers = {'User-Agent': agent[i % len(agent)]}
    url3 = f'https://km.fang.lianjia.com/loupan/pg{page}'
    response = requests.get(url=url3, headers=headers)
    if len(response.text) < 7000:
        i += 1
        continue
    selector = parsel.Selector(response.text)
    divs = selector.css('.resblock-list-wrapper>li')
    for div in divs:
        # development name
        title = div.css('.resblock-name>a::text').get()
        # property type
        typ = div.css('.resblock-name>span::text').get()
        # the location is split across <span> and <a> elements
        location = div.css('.resblock-location>span::text').getall()
        location2 = div.css('.resblock-location>a::text').getall()
        loc = location[0] + " " + location[1] + " " + location2[0]
        # price per square metre
        price = div.css('.resblock-price>.main-price>span::text').get()
        with open("昆明.csv", mode='a', encoding='utf-8', newline='') as f:
            csv_write = csv.writer(f)
            csv_write.writerow([title, typ, loc, price])
    page += 1
print("Run Successfully!")

# Save the scraped data into MySQL
def run_sql(path):
    connection = pymysql.connect(
        host="localhost",
        user="root",
        password="root",
        db="mysql",
        charset="utf8"
    )
    cursor = connection.cursor()
    df = pd.read_csv(path, encoding='utf-8')
    print(df.head())
    # Create the table; the first column is the auto-increment primary key,
    # and the insert below must line up with this structure
    cursor.execute('''create table if not exists houseInfo(
        id integer primary key auto_increment,
        house_type char(50),
        house_name varchar(200),
        location varchar(200),
        intro varchar(200),
        advantage varchar(200),
        price_or_rent char(50),
        price_all char(50),
        items varchar(200)
    )''')
    # Insert the rows one by one
    for i in range(df.shape[0]):
        data = df.iloc[i]
        data = (data["类型"], data["房名"], data["位置"], data["简介"],
                data["优势"], data["每平米售价/月租"], data["总价"], data["详情页"])
        sql = ("insert into houseInfo(house_type,house_name,location,intro,"
               "advantage,price_or_rent,price_all,items) values" + str(data) + ";")
        print(sql)
        try:
            cursor.execute(sql)   # execute the SQL statement
            connection.commit()   # commit the transaction
        except Exception:
            connection.rollback()
    cursor.close()
    connection.close()

path = r'C:\Users\86150\PycharmProjects\pythonProject2\链家.csv'
# run_sql(path)

# Data cleaning
def wash():
    df = pd.read_csv("链家.csv", encoding="utf-8")
    print("Number of duplicate rows:", df.duplicated().sum())
    print(df.shape)
    # drop duplicate rows
    df.drop_duplicates(inplace=True)
    # reset the index
    # df.index = range(df.shape[0])
    # fill in the missing total prices (rental rows have none)
    df.loc[df.总价.isnull(), '总价'] = "无总价"
    # save the cleaned file
    df.to_csv("lianjia2.csv")

wash()

# Data preprocessing: split the cleaned data by listing type
df = pd.read_csv("lianjia2.csv", encoding="utf-8", index_col=0)
group = df.groupby("类型")
df.to_csv("copy.csv", encoding="utf-8", index=False)

# second-hand listings
df2 = pd.read_csv("copy.csv", encoding="utf-8")
df2.drop(df2[df2.类型 == "租房"].index, inplace=True)
df2.drop(columns=["类型", "简介", "详情页", "总价"], inplace=True)
df2.to_csv("ershou.csv", encoding="utf-8", index=False)

# rental listings
df2 = pd.read_csv("copy.csv", encoding="utf-8")
df2.drop(df2[df2.类型 == "二手房"].index, inplace=True)
df2.drop(columns=["类型", "简介", "详情页"], inplace=True)
df2.to_csv("zufang.csv", encoding="utf-8", index=False)

# Visualisation: build a word cloud from the Kunming data
with open('昆明.csv', encoding='utf-8') as f:
    txt = f.read()
txt_list = jieba.lcut(txt)          # segment the Chinese text into words
string = ' '.join(txt_list)
wc = wordcloud.WordCloud(
    width=1000,                     # canvas width
    height=800,                     # canvas height
    background_color='white',       # background colour
    font_path='msyh.ttc',           # a font with Chinese glyph support
    scale=15                        # render density; larger is sharper
)
wc.generate(string)
wc.to_file("km.png")
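Step 5 of the plan, the Flask front end, is not included in the code above. The sketch below shows one minimal way the cleaned data could be served; the route, the templates/index.html template, and the to_dict hand-off are illustrative assumptions, not the original implementation.

from flask import Flask, render_template
import pandas as pd

app = Flask(__name__)

@app.route("/")
def index():
    # Read the preprocessed second-hand listings and pass a slice of the
    # rows to a Jinja2 template (templates/index.html is assumed to exist)
    df = pd.read_csv("ershou.csv", encoding="utf-8")
    rows = df.head(50).to_dict(orient="records")
    return render_template("index.html", rows=rows)

if __name__ == "__main__":
    app.run(debug=True)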
This experiment used the K-means method for a cluster analysis of Kunming second-hand housing prices, and the visual interface makes the presentation more intuitive and readable. Most of the data is presented in charts, which keeps it concise and clear; the scraped data is complete overall, the cleaning stage unified the format and style of the data well, and the preprocessing of the dataset was also handled well. The experiment begins with a visual analysis: a scatter plot gives a concise, intuitive view of the overall distribution of the data, and excluding the isolated cases above 20,000 improves the accuracy of the later predictions (a sketch of this filtering step follows this paragraph). Line charts and bar charts then show the number of homes on sale in each district and the breakdown across categories. The cluster analysis comes next, with the clustering process thoroughly presented in line charts, and the clustered data is then classified group by group, which strengthens the analysis. The weak point of the experiment is that the data structures were not well optimised: the code is rather redundant, and its structure was never properly refactored.
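As a rough illustration of the filtering described above, the sketch below reads the Kunming file (which the scraper writes without a header row), drops prices above 20,000, and plots the remainder as a scatter plot; matplotlib and the assigned column names are assumptions for illustration.

import pandas as pd
import matplotlib.pyplot as plt

# 昆明.csv is written without a header row, so assign column names here
km_df = pd.read_csv("昆明.csv", encoding="utf-8", header=None,
                    names=["name", "type", "location", "price"])
# Coerce the price text to numbers and drop rows that fail to parse
km_df["price"] = pd.to_numeric(km_df["price"].astype(str).str.replace(",", ""),
                               errors="coerce")
km_df = km_df.dropna(subset=["price"])
# Exclude the isolated cases above 20000, as described in the write-up
km_df = km_df[km_df["price"] <= 20000]

plt.scatter(range(len(km_df)), km_df["price"], s=5)
plt.xlabel("listing index")
plt.ylabel("price (yuan per square metre)")
plt.savefig("km_scatter.png")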