赞
踩
上篇文章给大家分享了如何用 Python 采集旅游景点的详细数据,
现在来分享各个城市的景点数据,并做一个数据可视化图。
确定目标需求
python采集旅游景点数据 / 去哪儿~
发送请求
获取数据
解析数据
保存数据
完整代码点击文末名片领取或者看代码中 +v
采集数据
导入模块
import requests
import parsel
import csv
import time
写入表格
# Scrape the paginated ticket listing and append one CSV row per scenic spot.

# Request headers: make the Python client look like a browser when it sends
# requests to the server. Built once because it does not change per page.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
}

# `with` guarantees the CSV is flushed and closed even if a request or a
# parsing step raises part-way through the crawl.
with open('张家界景点.csv', mode='a', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=['景区', '星级', '地区', '热度', '销量', '地址',
                                               '价格', '简介', '详情页'])
    # The file is opened in append mode, so only write the header row when
    # the file is still empty; otherwise every re-run would add another one.
    if f.tell() == 0:
        csv_writer.writeheader()

    # Multi-page crawl: pages 1 through 11.
    for page in range(1, 12):
        print(f'===============================正在爬取第{page}页数据内容=======================================')
        # Pause between pages to avoid hammering the server.
        time.sleep(2)
        # Request URL for the current page.
        url = f'https://*****.com/ticket/list_%E5%BC%A0%E5%AE%B6%E7%95%8C.html?from=mps_search_suggest_h&keyword=%E5%BC%A0%E5%AE%B6%E7%95%8C&page={page}'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url=url, headers=headers, timeout=10)
        # response.text holds the page HTML as text.
        # print(response.text)  # uncomment to inspect the raw HTML

        # Parse the data with CSS selectors. An id selector starts with '#'.
        # First pass: select every scenic-spot item node; the result is a
        # list of selector objects that are queried individually below.
        selector = parsel.Selector(response.text)
        lis = selector.css('#search-list .sight_item_detail')
        for li in lis:
            title = li.css('.name::text').get()
            level = li.css('.level::text').get()
            area = li.css('.area a::text').get()

            # `.get()` returns None when the node is missing, so guard before
            # stripping the '热度: ' prefix; an absent or non-numeric value is
            # stored as None instead of aborting the crawl.
            hot_text = (li.css('.product_star_level em::attr(title)').get() or '').replace('热度: ', '')
            try:
                hot = int(float(hot_text) * 100)
            except ValueError:
                hot = None

            address = li.css('.address span::attr(title)').get()
            price = li.css('.sight_item_price em::text').get()
            hot_num = li.css('.hot_num::text').get()
            intro = li.css('.intro::text').get()

            # Only build the detail-page URL when the link is actually there.
            href = li.css('.name::attr(href)').get()
            if href is not None:
                href = 'https://*****.com/' + href

            dit = {
                '景区': title,
                '星级': level,
                '地区': area,
                '热度': hot,
                '销量': hot_num,
                '地址': address,
                '价格': price,
                '简介': intro,
                '详情页': href,
            }
            csv_writer.writerow(dit)
            print(title, level, area, hot, address, price, hot_num, intro, href, sep=' | ')
导入景点数据
完整代码点击文末名片领取或者看代码中 +v
import pandas as pd import numpy as np import matplotlib.pyplot as plt 完整代码+v领取:xiaoyuanllsll import seaborn as sns %matplotlib inline plt.rcParams['font.sans-serif'] = ['SimHei'] # 设置加载的字体名 plt.rcParams['axes.unicode_minus'] = False # 解决保存图像是负号'-'显示为方块的问题 import jieba import re from pyecharts.charts import * from pyecharts import options as opts from pyecharts.globals import ThemeType import stylecloud from IPython.display import Image df = pd.read_csv(r"c:\python\demo2\爬虫入门教程45 五一去哪儿玩?\去哪儿.csv") df.head()
删除重复数据
# Drop rows that duplicate an earlier row (all columns compared) and rebind df to the result.
df = df.drop_duplicates()
查看数据信息
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()
景点价格Top20
from pyecharts.charts import Bar
from pyecharts import options as opts

# One row per scenic spot with its aggregated price (pivot_table's default
# aggregation), ordered from most to least expensive.
df_qunarPrice = df.pivot_table(values='价格', index='景区').sort_values(by='价格', ascending=False)
# Keep the 20 highest-priced spots for the chart.
df_data = df_qunarPrice.head(20)

# Build the bar chart step by step; each pyecharts call configures `c` in place.
c = Bar()
c.add_xaxis(df_data.index.tolist())
c.add_yaxis("", df_data['价格'].tolist())
# Hide the per-bar value labels.
c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
# Rotate the x-axis labels by 90 degrees.
c.set_global_opts(
    title_opts=opts.TitleOpts(title="景点价格Top20"),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90)),
)
c.render_notebook()
# Aggregate sales per scenic spot (pivot_table's default aggregation),
# ordered from highest to lowest.
df_saleCount = df.pivot_table(values='销量', index='景区').sort_values(by='销量', ascending=False)
# Top 20 rows; the trailing expression shows the raw values in the notebook.
df_data = df_saleCount.head(20)
df_data.to_numpy()
评分TOP20景点
from pyecharts.charts import Bar
from pyecharts import options as opts

# One row per scenic spot with its aggregated '热度' value (pivot_table's
# default aggregation), ordered from highest to lowest.
df_score = df.pivot_table(values='热度', index='景区').sort_values(by='热度', ascending=False)
# Keep the 20 highest-ranked spots for the chart.
df_data = df_score.head(20)

# Build the bar chart step by step; each pyecharts call configures `c` in place.
c = Bar()
c.add_xaxis(df_data.index.tolist())
c.add_yaxis("", df_data['热度'].tolist())
# Hide the per-bar value labels.
c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
# Rotate the x-axis labels by 90 degrees.
c.set_global_opts(
    title_opts=opts.TitleOpts(title="评分TOP20景点"),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90)),
)
c.render_notebook()
月销量TOP20景点
# Monthly sales Top 20: aggregate '销量' per scenic spot (pivot_table's
# default aggregation), sort descending, and chart the first 20 rows.
df_saleCount = df.pivot_table(index='景区', values='销量')
df_saleCount.sort_values('销量', inplace=True, ascending=False)
df_data = df_saleCount[:20]

from pyecharts import options as opts
from pyecharts.charts import Bar

c = (
    Bar()
    .add_xaxis(df_data.index.tolist())
    .add_yaxis("", df_data['销量'].values.tolist())
    # Hide the per-bar value labels.
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    # Rotate the x-axis labels by 90 degrees.
    .set_global_opts(
        title_opts=opts.TitleOpts(title="月销量TOP20景点"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=90)),
    )
)
c.render_notebook()
景点等级分布
# Count how many scenic spots fall into each star level, largest group first.
df_star = df["星级"].value_counts().sort_values(ascending=False)
print(df_star)

# Pie chart of the level distribution; each data item is a [label, count] pair.
c = Pie(init_opts=opts.InitOpts(theme=ThemeType.WALDEN))
c.add(
    "",
    [[label, count] for label, count in zip(df_star.index.tolist(), df_star.tolist())],
)
# Legend hidden; title and subtitle anchored at the top-left.
c.set_global_opts(
    legend_opts=opts.LegendOpts(is_show=False),
    title_opts=opts.TitleOpts(
        title="景点等级分布",
        subtitle="数据来源:去哪儿网",
        pos_top="0.5%",
        pos_left='left',
    ),
)
# Slice labels use the "{b}:{d}%" formatter at font size 16.
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%", font_size=16))
c.render_notebook()

# Rows whose star level is not '无', ordered by star level descending.
df.loc[df["星级"].ne('无')].sort_values(by="星级", ascending=False)
今天的分享到这里就结束了
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。