# 图形:Bar(柱状图)、Pie(饼图)、Histogram(直方图)、Scatter(散点图)、Map(地图)和WordCloud(词云图)
""" 读取excel数据,分析数据并生成图表 """ import pandas as pd from pyecharts import options as opts from pyecharts.charts import Bar, Pie, Scatter, WordCloud, Map, Page import numpy as np import jieba import jieba.analyse from pyecharts.commons.utils import JsCode # 面积分区 def cal_square_district(row): if row['面积'] <= 60: return '[0,60]' if row['面积'] > 60 and row['面积'] <= 90: return '[60,90]' if row['面积'] > 90 and row['面积'] <= 120: return '[90,120]' if row['面积'] > 120 and row['面积'] <= 150: return '[120,150]' if row['面积'] > 150 and row['面积'] <= 200: return '[150,200]' if row['面积'] > 200 and row['面积'] <= 300: return '[200, 300]' if row['面积'] > 300: return '[300,-]' return '[未知]' # 几室量化 def order_layout_ascending(row): if row['室'] == '1室': return 0 if row['室'] == '2室': return 1 if row['室'] == '3室': return 2 if row['室'] == '4室': return 3 if row['室'] == '5室': return 4 if row['室'] == '6室': return 5 # 颜色配置 layout_color_function = """ function (params) { if (params.value > 17000 && params.value < 18000) { return 'red'; } else if (params.value > 18000 && params.value < 20000) { return 'blue'; }else if (params.value > 20000 && params.value < 25000){ return 'green' }else if (params.value > 25000 && params.value < 35000){ return 'purple' }else if (params.value > 35000 && params.value < 40000){ return 'black' } return 'brown'; } """ # 按室均价 def unit_price_analysis_by_layout(df, isembed): # 增加一列[面积区间] df['面积区间'] = df.apply(cal_square_district, args=(), axis=1) # 获取要分析的数据行和列 analysis_df = df.loc[:, ['室', '均价']] analysis_df.loc[:, '室'] = analysis_df.loc[:, '室'].astype('str') # 对面积区间列group by,然后按分组计算总价和均价的平均值 group = analysis_df.groupby('室', as_index=False) group_df = group.mean() group_df.loc[:, '均价'] = group_df.loc[:, '均价'].astype('int') # 给室这个字段排个序 group_df['order'] = group_df.apply(order_layout_ascending, axis=1) group_df.sort_values('order', ascending=True, inplace=True) bar = ( Bar() .add_xaxis(group_df['室'].tolist()) .add_yaxis("单价均价", group_df["均价"].tolist(), 
itemstyle_opts=opts.ItemStyleOpts(color=JsCode(layout_color_function))) .set_global_opts(title_opts=opts.TitleOpts(title="武汉二手房按户型的房屋单价"), legend_opts=opts.LegendOpts(is_show=False)) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return bar.render_embed() else: return bar def order_square_ascending(row): if row['面积区间'] == '[0,60]': return 0 if row['面积区间'] == '[60,90]': return 1 if row['面积区间'] == '[90,120]': return 2 if row['面积区间'] == '[120,150]': return 3 if row['面积区间'] == '[150,200]': return 4 if row['面积区间'] == '[200,300]': return 5 if row['面积区间'] == '[300,-]': return 6 square_color_function = """ function (params) { if (params.value > 17000 && params.value < 18000) { return 'red'; } else if (params.value > 18000 && params.value < 20000) { return 'blue'; }else if (params.value > 20000 && params.value < 25000){ return 'green' }else if (params.value > 25000 && params.value < 35000){ return 'purple' }else if (params.value > 35000 && params.value < 40000){ return 'black' } return 'brown'; } """ # 按面积区间均价分布 def unit_price_analysis_by_square(df, isembed): # 增加一列[面积区间] df['面积区间'] = df.apply(cal_square_district, args=(), axis=1) # 获取要分析的数据行和列 analysis_df = df.loc[:, ['面积区间', '均价']] analysis_df.loc[:, '面积区间'] = analysis_df.loc[:, '面积区间'].astype('str') # 对面积区间列group by,然后按分组计算总价和均价的平均值 group = analysis_df.groupby('面积区间', as_index=False) group_df = group.mean() group_df.loc[:, '均价'] = group_df.loc[:, '均价'].astype('int') # 把面积区间按从小到大排个序 group_df['order'] = group_df.apply(order_square_ascending, axis=1) group_df.sort_values('order', ascending=True, inplace=True) bar = ( Bar() .add_xaxis(group_df['面积区间'].tolist()) .add_yaxis("单价均价", group_df["均价"].tolist(), itemstyle_opts=opts.ItemStyleOpts(color=JsCode(square_color_function))) .set_global_opts( title_opts=opts.TitleOpts(title="武汉二手房按面积区间的房屋单价"), legend_opts=opts.LegendOpts(is_show=False)) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return bar.render_embed() else: return bar top10_color_function = """ function (params) { if (params.value > 
27000 && params.value < 27500) { return 'red'; } else if (params.value > 27500 && params.value < 27800) { return 'blue'; }else if (params.value > 27800 && params.value < 28000){ return 'green' }else if (params.value > 28000 && params.value < 29000){ return 'purple' }else if (params.value > 29000 && params.value < 30000){ return 'brown' }else if (params.value > 30000 && params.value < 35200){ return 'gray' }else if (params.value > 35200 && params.value < 37000){ return 'orange' }else if (params.value > 37000 && params.value < 40000){ return 'pink' }else if (params.value > 40000 && params.value < 45000){ return 'navy' } return 'gold'; } """ # 小区均价top10 def unit_price_analysis_by_estate(df, isembed): # 获取要分析的数据列 analysis_df = df.loc[:, ['小区名称', '均价']] analysis_df.loc[:, '小区名称'] = analysis_df.loc[:, '小区名称'].astype('str') # 对小区名称分组,然后按照分组计算单价均价 group = analysis_df.groupby('小区名称', as_index=False) group_df = group.mean() group_df.loc[:, '均价'] = group_df.loc[:, '均价'].astype('int') # 按照均价列降序排序 group_df.sort_values('均价', ascending=False, inplace=True) # 取Top10 top10_df = group_df.head(10) # print(top10_df) # 为了横向柱状图展示,再从低到高排序一下 top10_df.sort_values('均价', ascending=True, inplace=True) bar = ( Bar(init_opts=opts.InitOpts(width="1500px")) .add_xaxis(top10_df['小区名称'].tolist()) .add_yaxis("房价单价", top10_df['均价'].tolist(), itemstyle_opts=opts.ItemStyleOpts(color=JsCode(top10_color_function))) .reversal_axis() .set_series_opts(label_opts=opts.LabelOpts(position="right")) .set_global_opts(title_opts=opts.TitleOpts(title="武汉各小区二手房房价TOP10"), xaxis_opts=opts.AxisOpts(axislabel_opts={'interval': '0'}), legend_opts=opts.LegendOpts(is_show=False)) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return bar.render_embed() else: return bar # 按区均价分布 def unit_price_analysis_by_district(df): # 获取要分析的数据列 analysis_df = df.loc[:, ['区', '均价']] analysis_df.loc[:, '区'] = analysis_df.loc[:, '区'].astype('str') # 对小区名称分组,然后按照分组计算单价均价 group = analysis_df.groupby('区', as_index=False) group_df = group.mean() 
group_df.loc[:, '均价'] = group_df.loc[:, '均价'].astype('int') # 按照均价列降序排序 group_df.sort_values('均价', ascending=True, inplace=True) bar = ( Bar(init_opts=opts.InitOpts(width="1500px")) .add_xaxis(group_df['区'].tolist()) .add_yaxis("房价单价", group_df['均价'].tolist()) .reversal_axis() .set_series_opts(label_opts=opts.LabelOpts(position="right")) .set_global_opts(title_opts=opts.TitleOpts(title="武汉各区域二手房房价排行榜"), xaxis_opts=opts.AxisOpts(axislabel_opts={'interval': '0'})) ) return bar.render_embed() def add_sale_estate_col(row): return 0 # 不同建筑年份的待售数量 def sale_estate_analysis_by_year(df, isembed): # 增加一列待售房屋数,初始值均为0 df.loc[:, '待售房屋数'] = df.apply(add_sale_estate_col, axis=1) # 获取要用作数据分析的两列:建筑年份和待售房屋数 analysis_df = df.loc[:, ['建筑年份', '待售房屋数']] # 因为建筑年份列有空值,先预处理一下 analysis_df.dropna(inplace=True) # 按照建筑年份进行分组 group = analysis_df.groupby('建筑年份', as_index=False) # 对每个分组进行统计计数 group_df = group.count() group_df.loc[:, '待售房屋数'] = group_df.loc[:, '待售房屋数'].astype('int') pie = Pie(init_opts=opts.InitOpts(width='800px', height='600px', bg_color='white')) pie.add("pie", [list(z) for z in zip(group_df['建筑年份'].tolist(), group_df['待售房屋数'].tolist())] , radius=['40%', '60%'] , center=['50%', '50%'] , label_opts=opts.LabelOpts( position="outside", formatter="{b}:{c}:{d}%", ) ).set_global_opts( title_opts=opts.TitleOpts(title='武汉二手房不同建筑年份的待售数量', pos_left='300', pos_top='20', title_textstyle_opts=opts.TextStyleOpts(color='black', font_size=16)), legend_opts=opts.LegendOpts(is_show=False)) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return pie.render_embed() else: return pie # 均价价格分布 def unit_price_analysis_by_histogram(df, isembed): hist, bin_edges = np.histogram(df['均价'], bins=100) bar = ( Bar() .add_xaxis([str(x) for x in bin_edges[:-1]]) .add_yaxis('价格分布', [float(x) for x in hist], category_gap=0) .set_global_opts( title_opts=opts.TitleOpts(title='武汉二手房房价-单价分布-直方图', pos_left='center'), legend_opts=opts.LegendOpts(is_show=False) ) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return bar.render_embed() else: 
return bar # 总价价格分布 def total_price_analysis_by_histogram(df, isembed): hist, bin_edges = np.histogram(df['总价'], bins=100) bar = ( Bar() .add_xaxis([str(x) for x in bin_edges[:-1]]) .add_yaxis('价格分布', [float(x) for x in hist], category_gap=0) .set_global_opts( title_opts=opts.TitleOpts(title='武汉二手房房价-总价分布-直方图', pos_left='center'), legend_opts=opts.LegendOpts(is_show=False) ) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return bar.render_embed() else: return bar # 面积——单价关系 def unit_price_analysis_by_scatter(df, isembed): df.sort_values('面积', ascending=True, inplace=True) square = df['面积'].to_list() unit_price = df['均价'].to_list() scatter = ( Scatter() .add_xaxis(xaxis_data=square) .add_yaxis( series_name='', y_axis=unit_price, symbol_size=4, label_opts=opts.LabelOpts(is_show=False) ) .set_global_opts( xaxis_opts=opts.AxisOpts(type_='value'), yaxis_opts=opts.AxisOpts(type_='value'), title_opts=opts.TitleOpts(title='武汉二手房面积-单价关系图', pos_left='center') ) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return scatter.render_embed() else: return scatter # 房屋标题标签热度词 def hot_word_analysis_by_wordcloud(df, isembed): txt = '' for index, row in df.iterrows(): txt = txt + str(row['待售房屋']) + ';' + str(row['标签']) + '\n' word_weights = jieba.analyse.extract_tags(txt, topK=100, withWeight=True) word_cloud = ( WordCloud() .add(series_name='高频词语', data_pair=word_weights, word_size_range=[10, 100]) .set_global_opts( title_opts=opts.TitleOpts( title='武汉二手房销售热度词', title_textstyle_opts=opts.TextStyleOpts(font_size=23), pos_left='center' ) ) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return word_cloud.render_embed() else: # png_name = 'hot_word_analysis_by_wordcloud.png' # make_snapshot(snapshot, word_cloud.render(), f"crawler/anjuke/static/{png_name}") # return png_name return word_cloud # 规范区名 def transform_name(row): district_name = row['区'].strip() if district_name == '江汉' or district_name == '江岸' or district_name == '硚口' or district_name == '汉阳' or district_name == '武昌' or district_name == '东西湖' or 
district_name == '洪山': district_name = district_name + '区' return district_name # 按区均价分布地图 def unit_price_analysis_by_map(df, isembed): data = [] # 获取要分析的数据列 analysis_df = df.loc[:, ['区', '均价']] # 按区列分组 group_df = analysis_df.groupby('区', as_index=False) # 根据分组对均价列求平均值 group_df = group_df.mean('均价') # print(group_df) # 将区的名字做一下转换,为下面的地图匹配做准备 group_df['区'] = group_df.apply(transform_name, axis=1) group_df.loc[:, '均价'] = group_df.loc[:, '均价'].astype('int') # 将数据转换成map需要的数据格式 for index, row in group_df.iterrows(): district_array = [row['区'], row['均价']] data.append(district_array) map = ( Map() .add('武汉各区域二手房房价', data, '武汉') .set_global_opts( title_opts=opts.TitleOpts(title='武汉各区域二手房房价地图', pos_left='center'), visualmap_opts=opts.VisualMapOpts(max_=26000), legend_opts=opts.LegendOpts(is_show=False) ) ) # 判断是否单独显示,还是和其他图表一起显示 if isembed: return map.render_embed() else: # png_name = 'unit_price_analysis_by_map.png' # make_snapshot(snapshot, map.render(), f"crawler/anjuke/static/{png_name}") # return png_name return map # 主函数 if __name__ == '__main__': # 读取csv fpath = 'data/wuhanSecondHouse.csv' df = pd.read_csv(fpath, header=[0], encoding='gbk') df.drop_duplicates(keep='first', inplace=True) # 可视化 # 获取按面积区间的单价分析-柱状图 unit_price_analysis_by_square = unit_price_analysis_by_square(df, False) # 获取按室区分的单价分析-柱状图 unit_price_analysis_by_layout = unit_price_analysis_by_layout(df, False) # 获取苏州各小区二手房房价TOP10横向-柱状图 unit_price_analysis_by_estate = unit_price_analysis_by_estate(df, False) # 获取不同建筑年份的待售房屋数-饼图 sale_estate_analysis_by_year = sale_estate_analysis_by_year(df, False) # 苏州二手房房价-单价分布-直方图 unit_price_analysis_by_histogram = unit_price_analysis_by_histogram(df, False) # 苏州二手房房价-总价分布-直方图 total_price_analysis_by_histogram = total_price_analysis_by_histogram(df, False) # 苏州二手房面积-单价关系图 unit_price_analysis_by_scatter = unit_price_analysis_by_scatter(df, False) # 苏州二手房销售热度词-词云 # hot_word_analysis_by_wordcloud_png_name = dbc.hot_word_analysis_by_wordcloud(df,False) 
hot_word_analysis_by_wordcloud = hot_word_analysis_by_wordcloud(df, False) # 苏州各区域二手房房价分布-地图 # unit_price_analysis_by_map_png_name = dbc.unit_price_analysis_by_map(df,False) unit_price_analysis_by_map = unit_price_analysis_by_map(df, False) # web展示所有图 page = Page(layout=Page.DraggablePageLayout) # 可拖动布局 page.add( unit_price_analysis_by_square, unit_price_analysis_by_layout, unit_price_analysis_by_estate, sale_estate_analysis_by_year, unit_price_analysis_by_histogram, total_price_analysis_by_histogram, unit_price_analysis_by_scatter, hot_word_analysis_by_wordcloud, unit_price_analysis_by_map ) page.render("武汉二手房数据分析.html")
