赞
踩
爬虫说明:
1、本爬虫是以面向对象的方式进行代码架构的
2、本爬虫爬取的数据存入到MongoDB数据库中
3、爬虫代码中有详细注释
4、爬虫爬取的美食是以无锡为例
代码展示
import json
import random
import re
import time

from pymongo import MongoClient
import requests
from lxml import html


class MeituanSpider():
    """Crawl Meituan restaurant listings, category by category, into MongoDB.

    The spider reads the JSON literal that every listing page assigns to
    ``window._appState``, walks all result pages of each food category and
    stores one document per restaurant in the ``test.mt_foods`` collection.
    """

    # Number of restaurants shown on one listing page.
    PAGE_SIZE = 15
    # Seconds to wait for a response before giving up on a request.
    REQUEST_TIMEOUT = 10
    # Listing data is embedded in the page as `window._appState = {...};</script>`.
    _APP_STATE_RE = re.compile(r'window._appState = (.*?);</script>')

    def __init__(self):
        # Entry URL.
        # NOTE(review): the article says Wuxi is crawled and the category URLs
        # mentioned in run() use the `wx` sub-domain; `chs` is kept unchanged
        # from the original - confirm which city is intended.
        self.start_url = 'https://chs.meituan.com/meishi/'
        # Log in with your own account first, then copy the Cookie and
        # User-Agent of that session to build the request headers.
        self.headers = {
            # Replace with your own cookie.
            "Cookie": "_lxsdk_cuid=17567c82defc8-02c8aee262bc18-3e604000-144000-17567c82defc8; _hc.v=f17bef2e-9394-ea78-d6a7-940fc84143be.1614495157; mtcdn=K; ci=70; rvct=70%2C52; lsu=; uuid=99cbecdfcd6342ca9753.1617116140.1.0.0; _lx_utm=utm_source%3Dbaidu%26utm_medium%3Dorganic%26utm_term%3D%25E7%25BE%258E%25E5%259B%25A2; __mta=218988198.1617067475078.1617152337122.1617500202673.20; client-id=6cfcedec-72cb-470f-86a6-dddf64bc8869; lt=sOSqHk9WE66qIJX1xr-r9ytOpXsAAAAAJg0AAG99qBYNh2fwnJJ-MPffiG58lnM3m45u2teQdyug6LscHSf9jh_RDfoFcgz4UhgqfA; u=2585285025; n=%E9%A9%AC%E5%B0%91%E7%88%B1%E4%BD%A0%E4%B9%88%E4%B9%88%E5%93%92; token2=sOSqHk9WE66qIJX1xr-r9ytOpXsAAAAAJg0AAG99qBYNh2fwnJJ-MPffiG58lnM3m45u2teQdyug6LscHSf9jh_RDfoFcgz4UhgqfA; unc=%E9%A9%AC%E5%B0%91%E7%88%B1%E4%BD%A0%E4%B9%88%E4%B9%88%E5%93%92; firstTime=1617500314442; _lxsdk=17567c82defc8-02c8aee262bc18-3e604000-144000-17567c82defc8; _lxsdk_s=1789a866045-cd1-379-f7b%7C%7C6",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
        }
        # Connect to the default local MongoDB instance and select the
        # target collection.
        self.client = MongoClient()
        self.collection = self.client['test']['mt_foods']

    @staticmethod
    def _force_https(url):
        """Return *url* with an ``http://`` scheme upgraded to ``https://``.

        Only the scheme prefix is rewritten, so a URL that already uses
        HTTPS (or contains "http" elsewhere) is returned unchanged.
        """
        prefix = 'http://'
        if url.startswith(prefix):
            return 'https://' + url[len(prefix):]
        return url

    def _fetch_app_state(self, url):
        """Request *url* and return the raw ``window._appState`` JSON string.

        Raises:
            IndexError: if the page does not contain ``window._appState``
                (same exception type the original ``findall(...)[0]`` raised).
        """
        rest = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT)
        match = self._APP_STATE_RE.search(rest.content.decode())
        if match is None:
            raise IndexError(
                'window._appState not found in response from {}'.format(url))
        return match.group(1)

    def get_url_list(self, url, total_nums):
        """Build the list of page URLs for one category.

        Args:
            url: category base URL; ``pn<page>/`` is appended to it.
            total_nums: total number of restaurants in the category.

        Returns:
            One URL per result page, pages numbered from 1.
        """
        url_temp = url + 'pn{}/'
        # Ceiling division: a partially filled last page still counts.
        pages = -(-total_nums // self.PAGE_SIZE)
        return [url_temp.format(page) for page in range(1, pages + 1)]

    def parse_url(self, url):
        """Fetch *url* after a one-second pause and return its app-state JSON."""
        # Sleep for one second to avoid being recognised as a crawler.
        time.sleep(1)
        return self._fetch_app_state(url)

    def get_content_list(self, html_str, item):
        """Store every restaurant listed in one result page.

        Args:
            html_str: app-state JSON string as returned by ``parse_url``.
            item: dict holding the category fields; it is updated in place
                with each restaurant's fields before being saved.
        """
        foods = json.loads(html_str)['poiLists']['poiInfos']
        for food in foods:
            item['food_id'] = food['poiId']
            item['food_url'] = "https://www.meituan.com/meishi/{}/".format(item['food_id'])
            item['title'] = food['title']
            item['avg_score'] = food['avgScore']
            item['avg_price'] = food['avgPrice']
            item['comments'] = food['allCommentNum']
            # The first three characters of the address are used as the area.
            item['area'] = food['address'][0:3]
            item['address'] = food['address']
            print(item)
            self.save(item)

    def save(self, item):
        """Insert a copy of *item* into the MongoDB collection."""
        # A copy is stored because the caller reuses the same dict for every
        # restaurant. `insert_one` replaces `Collection.insert`, which no
        # longer exists in PyMongo 4.
        self.collection.insert_one(item.copy())

    def run(self):
        """Crawl every category reachable from the entry page."""
        # Request the entry URL to obtain the URL of every food category.
        entry_state = json.loads(self._fetch_app_state(self.start_url))
        # Skip the first entry (presumably the catch-all category - confirm).
        cate_list = entry_state['filters']['cates'][1:]

        # The page exposes category URLs such as
        # http://wx.meituan.com/meishi/c11/ while the working address is
        # https://wx.meituan.com/meishi/c11/, so the scheme is upgraded.
        item_list = [
            {
                'cate_url': self._force_https(cate['url']),
                'cate_name': cate['name'],
            }
            for cate in cate_list
        ]

        for item in item_list:
            # parse_url sleeps one second before each request.
            cate_state = json.loads(self.parse_url(item['cate_url']))
            total_nums = cate_state['poiLists']['totalCounts']
            for url in self.get_url_list(item['cate_url'], total_nums):
                self.get_content_list(self.parse_url(url), item)


if __name__ == '__main__':
    meituan = MeituanSpider()
    meituan.run()
数据分析和数据可视化说明:
1、本博客通过Flask框架来进行数据分析和数据可视化
2、项目的架构图为
代码展示
import pandas as pd
import numpy as np
import pymysql


def _as_pairs(frame, key, value, ndigits=None):
    """Turn two columns of *frame* into ``[[key, value], ...]`` rows.

    When *ndigits* is given, each value is rounded to that many decimals.
    """
    rows = frame.to_dict(orient='records')
    if ndigits is None:
        return [[row[key], row[value]] for row in rows]
    return [[row[key], round(row[value], ndigits)] for row in rows]


def pre_process(df):
    """Clean the raw restaurant frame in place and return it.

    Drops the ``cate_url`` and ``food_id`` columns, removes rows containing
    NaN and keeps only the first row for each restaurant ``title``.
    """
    # Columns that are not needed for the analysis.
    df.drop(columns=['cate_url', 'food_id'], inplace=True)
    df.dropna(how='any', inplace=True)
    # Rows sharing a restaurant name are treated as duplicates.
    df.drop_duplicates(subset=['title'], inplace=True)
    return df


def food_cate_count(df):
    """Return ``[cate_name, restaurant count]`` for every food category."""
    grouped = df.groupby('cate_name')['title'].count().reset_index()
    data = _as_pairs(grouped, 'cate_name', 'title')
    print(data)
    return data


def food_cate_score(df):
    """Return ``[cate_name, mean avg_score]`` per category, 1 decimal."""
    grouped = df.groupby(['cate_name'])['avg_score'].mean().reset_index()
    data = _as_pairs(grouped, 'cate_name', 'avg_score', ndigits=1)
    print(data)
    return data


def food_cate_price(df):
    """Return ``[cate_name, mean avg_price]`` per category, 1 decimal."""
    grouped = df.groupby(['cate_name'])['avg_price'].mean().reset_index()
    data = _as_pairs(grouped, 'cate_name', 'avg_price', ndigits=1)
    print(data)
    return data


def food_comment_top10(df):
    """Return ``[title, comments]`` for the ten most-commented restaurants."""
    food_comments = df.sort_values(by='comments', ascending=False)[['title', 'comments']][:10]
    data = _as_pairs(food_comments, 'title', 'comments')
    print(data)
    return data


def food_area_count(df):
    """Return ``[area, restaurant count]`` for every area."""
    grouped = df.groupby(['area'])['title'].count().reset_index()
    data = _as_pairs(grouped, 'area', 'title')
    print(data)
    return data


# Every analysis paired with the INSERT statement of the table that stores
# its result.
_ANALYSES = {
    'cate_count': (
        food_cate_count,
        "insert into db_food_cate_count(cate_name,count) values(%s,%s)",
    ),
    'cate_score': (
        food_cate_score,
        "insert into db_food_cate_score(cate_name,avg_score) values(%s,%s)",
    ),
    'cate_price': (
        food_cate_price,
        "insert into db_food_cate_price(cate_name,avg_price) values(%s,%s)",
    ),
    'comment_top10': (
        food_comment_top10,
        "insert into db_food_comment_top10(name,comment) values(%s,%s)",
    ),
    'area_count': (
        food_area_count,
        "insert into db_food_area_count(area,count) values(%s,%s)",
    ),
}

# Analysis executed by one run of the script; change the key and re-run to
# fill another table.
_SELECTED_ANALYSIS = 'area_count'


if __name__ == '__main__':
    # Load the restaurant data (one JSON document per line).
    df = pd.read_json('美食.json', lines=True)
    # Print basic information about the frame.
    print(df.head(5))
    print(df.info())
    df = pre_process(df)

    analyse, sql = _ANALYSES[_SELECTED_ANALYSIS]
    data = analyse(df)

    conn = pymysql.connect(host='localhost', user='root', password='123456',
                           port=3306, database='mt_food', charset='utf8')
    try:
        with conn.cursor() as cursor:
            try:
                result = cursor.executemany(sql, data)
                # Commit before reporting so the success message is only
                # printed once the rows are durable.
                conn.commit()
                if result:
                    print('插入数据成功')
            except pymysql.MySQLError as error:
                print(error)
                conn.rollback()
    finally:
        # Release the connection whether or not the insert succeeded.
        conn.close()
import json
from pymongo import MongoClient


def save_json(item_list):
    """Write *item_list* to ``美食.json`` as one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    with open('美食.json', 'w', encoding='utf-8') as out_file:
        for document in item_list:
            out_file.write(json.dumps(document, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    # Dump everything stored in MongoDB (test.mt_foods) to the JSON file.
    mongo = MongoClient()
    foods = mongo['test']['mt_foods']
    # Exclude Mongo's `_id` field from the exported documents; the cursor is
    # read completely before the output file is opened.
    documents = list(foods.find({}, {'_id': 0}))
    save_json(documents)
from api_1_0 import db # 美食分类与餐厅数量关系模型 class FoodCateCount(db.Model): __tablename__ = 'db_food_cate_count' id = db.Column(db.Integer,primary_key=True,autoincrement=True) cate_name = db.Column(db.String(64),nullable=False) count = db.Column(db.Integer,nullable=False) # 美食分类与餐厅平均评分关系模型 class FoodCateScore(db.Model): __tablename__ = 'db_food_cate_score' id = db.Column(db.Integer, primary_key=True, autoincrement=True) cate_name = db.Column(db.String(64), nullable=False) avg_score = db.Column(db.Float, nullable=False) # 美食分类与餐厅平均价格关系模型 class FoodCatePrice(db.Model): __tablename__ = 'db_food_cate_price' id = db.Column(db.Integer, primary_key=True, autoincrement=True) cate_name = db.Column(db.String(64), nullable=False) avg_price = db.Column(db.Float, nullable=False) # 评论最多的前十个餐厅模型 class FoodCommentTop10(db.Model): __tablename__ = 'db_food_comment_top10' id = db.Column(db.Integer, primary_key=True, autoincrement=True) name = db.Column(db.String(64), nullable=False) comment = db.Column(db.Integer, nullable=False) # 地区与餐厅数量分布模型 class FoodAreaCount(db.Model): __tablename__ = 'db_food_area_count' id = db.Column(db.Integer, primary_key=True, autoincrement=True) area = db.Column(db.String(64), nullable=False) count = db.Column(db.Integer, nullable=False)
class Config:
    """Settings shared by every run mode."""

    # NOTE(review): the secret key and database credentials are hard-coded.
    SECRET_KEY = '5211314'
    SQLALCHEMY_DATABASE_URI = 'mysql://root:123456@localhost:3306/mt_food'
    SQLALCHEMY_TRACK_MODIFICATIONS = True


class DevelopmentConfig(Config):
    """Development mode: base settings plus ``DEBUG``."""

    DEBUG = True


class ProjectConfig(Config):
    """Project mode: the base settings unchanged."""


# Maps the mode name to the configuration class that is loaded for it.
config_map = dict(
    develop=DevelopmentConfig,
    project=ProjectConfig,
)
from flask_sqlalchemy import SQLAlchemy
import pymysql
from flask import Flask
from config import config_map

# Let PyMySQL stand in for MySQLdb.
pymysql.install_as_MySQLdb()

# Shared database handle; bound to an application inside create_app().
db = SQLAlchemy()


def create_app(mode='develop'):
    """Build the Flask application for the given configuration *mode*.

    Args:
        mode: key of ``config_map`` selecting the configuration class.

    Returns:
        The Flask app with the database initialised and the chart blueprint
        registered under ``/show``.
    """
    flask_app = Flask(__name__)
    flask_app.config.from_object(config_map[mode])
    db.init_app(flask_app)

    # Imported inside the function, as in the original layout, rather than
    # at module import time.
    from api_1_0 import views
    flask_app.register_blueprint(views.blue, url_prefix='/show')
    return flask_app
from api_1_0 import create_app,db
from flask_script import Manager
from flask_migrate import Migrate,MigrateCommand
from flask import render_template

# NOTE(review): Flask-Script is unmaintained and newer Flask-Migrate releases
# appear to no longer ship `MigrateCommand` - confirm the pinned versions.

# Create the app (default mode)
app = create_app()
# Enable running the program through command-line commands
manager = Manager(app)
# Initialise the database migration extension
Migrate(app,db)
# Register the `db` command group on the command-line manager
manager.add_command('db',MigrateCommand)


# Home page
@app.route('/')
def index():
    """Render the landing page template."""
    return render_template('index.html')


if __name__ == '__main__':
    manager.run()
__init__.py
from flask import Blueprint
# Import the models module so the model classes are loaded when the main
# program runs. show.py imports these classes from `..models`, so the same
# module name is used here (the original imported `model`).
from api_1_0 import models
# Blueprint that carries every chart view.
blue = Blueprint('show',__name__)
# Import the view functions so their routes attach to `blue`; this must come
# after `blue` is defined because show.py imports it from this package.
from . import show
show.py
"""Chart views: query the pre-computed tables and pass each template only
the plain lists/dicts it renders."""
from api_1_0.views import blue
from .. models import FoodAreaCount,FoodCateCount,FoodCatePrice,FoodCateScore,FoodCommentTop10
from flask import render_template


def _wrap_name(name, limit=6):
    """Insert a line break in the middle of *name* when it exceeds *limit* characters."""
    if len(name) <= limit:
        return name
    middle = len(name) // 2
    return name[:middle] + '\n' + name[middle:]


# Map of restaurant counts per area
@blue.route('/drawGeo')
def drawGeo():
    """Render the area map with its data and the value range of the colour scale."""
    data = [{'name': row.area, 'value': row.count}
            for row in FoodAreaCount.query.all()]
    counts = [entry['value'] for entry in data]
    # `default` keeps the page rendering when the table is still empty.
    return render_template(
        'drawGeo.html',
        data=data,
        max=max(counts, default=0),
        min=min(counts, default=0),
    )


# Bar chart of the ten restaurants with the most comments
@blue.route('/drawSignalBar')
def drawSignalBar():
    """Render the top-10 bar chart; long restaurant names are wrapped onto two lines."""
    food_comment = FoodCommentTop10.query.all()
    return render_template(
        'drawSignalBar.html',
        new_food_name=[_wrap_name(row.name) for row in food_comment],
        comment=[row.comment for row in food_comment],
    )


# Pie chart of the restaurant share per food category
@blue.route('/drawPie')
def drawPie():
    """Render the pie chart from ``{'name', 'value'}`` entries per category."""
    data = [{'name': row.cate_name, 'value': row.count}
            for row in FoodCateCount.query.all()]
    return render_template('drawPie.html', data=data)


# Bar chart comparing average price and average score per food category
@blue.route('/drawBar')
def drawBar():
    """Render the price/score comparison chart.

    The category axis comes from the price table; scores are matched to it
    by category name, so both series stay aligned even if the two queries
    return rows in a different order. A category without a score yields
    ``None`` for that bar.
    """
    cate_price = FoodCatePrice.query.all()
    score_by_cate = {row.cate_name: row.avg_score
                     for row in FoodCateScore.query.all()}
    cate_name = [row.cate_name for row in cate_price]
    return render_template(
        'drawBar.html',
        cate_name=cate_name,
        avg_price=[row.avg_price for row in cate_price],
        avg_score=[score_by_cate.get(name) for name in cate_name],
    )
主页简单创建了四个超链接指向对应的图表
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>首页说明</title>
    <style>
        .container{
            width: 100%;
            height: 600px;
            padding: 40px;
            line-height: 60px;
        }
        ul{
            margin: auto;
            width: 60%;
        }
    </style>
</head>
<body>
<!-- Landing page: one link per chart view of the `show` blueprint.
     The links are generated with url_for so they follow the host and port
     the application is actually served from. -->
<div class="container">
    <ul>
        <li><a href="{{ url_for('show.drawGeo') }}" target="_blank"><h3>无锡不同地区的餐厅数量分布图</h3></a></li>
        <li><a href="{{ url_for('show.drawPie') }}" target="_blank"><h3>不同美食分类的餐厅数量占比图</h3></a></li>
        <li><a href="{{ url_for('show.drawBar') }}" target="_blank"><h3>无锡不同美食分类下的餐厅的平均价格&amp;餐厅的平均评分比较图</h3></a></li>
        <li><a href="{{ url_for('show.drawSignalBar') }}" target="_blank"><h3>无锡评论最多的前十家餐厅柱状图</h3></a></li>
    </ul>
</div>
</body>
</html>
drawGeo.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>无锡不同地区的餐厅数量分布图</title>
    <script src="../static/js/echarts.min.js"></script>
    <script src="../static/theme/vintage.js"></script>
    <script src="../static/js/jquery.min.js"></script>
</head>
<body>
<!-- Container the map chart is drawn into. -->
<div class="chart" style="width: 800px;height: 600px;margin: auto"></div>
<script>
    // Create the chart with the 'vintage' theme.
    var myCharts = echarts.init(document.querySelector('.chart'),'vintage')
    // Values injected by the Flask view: per-area entries and the value range.
    var data = {{ data|tojson }}
    var min = {{ min |tojson}}
    var max = {{ max |tojson}}
    // Load the map definition (presumably GeoJSON of Wuxi - confirm) and
    // register it before building the chart options.
    $.get('../static/json/wuxi.json',function (ret) {
        echarts.registerMap('WuXiMap',ret)
        var option = {
            title:{
                text:'无锡不同地区的餐厅数量分布',
                textStyle:{
                    fontFamily:'楷体',
                    fontSize:21
                },
                top:20,
                left:20
            },
            tooltip:{
                trigger:'item',
                triggerOn:'mousemove',
                // Show the area name and its restaurant count.
                formatter:function (arg){
                    return '地区名称:'+arg.name+"<br>"+'餐厅数量:'+arg.value
                }
            },
            geo:{
                type:'map',
                map:'WuXiMap',
                roam:true,
                label:{
                    show:true
                },
                zoom:1
            },
            series:[
                {
                    type:'map',
                    // Bind the series to the geo component declared above.
                    geoIndex:0,
                    data:data,
                }
            ],
            visualMap:{
                left: 20,
                bottom:10,
                min:min,
                max:max,
                inRange:{
                    color:['pink','red'] // controls the range of the colour gradient
                },
                calculable:true,
            }
        }
        myCharts.setOption(option)
    })
</script>
</body>
</html>
结论:
梁溪区的餐厅数量最多,餐饮业发达,从地理位置来看,梁溪区也位于无锡市的中心地带,因此经济相对发达。
drawBar.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>无锡不同美食分类下的餐厅的平均价格&餐厅的平均评分比较图</title>
    <script src="../static/js/echarts.min.js"></script>
    <script src="../static/theme/vintage.js"></script>
</head>
<body>
<!-- Container the bar chart is drawn into. -->
<div class="chart" style="width: 100%;height: 600px;margin: auto"></div>
<script>
    // Create the chart with the 'vintage' theme.
    var myCharts = echarts.init(document.querySelector('.chart'),'vintage')
    // Parallel arrays injected by the Flask view: category names and the
    // average price / average score belonging to each of them.
    var cate_name = {{ cate_name|tojson }}
    var avg_price = {{ avg_price|tojson }}
    var avg_score = {{ avg_score|tojson }}
    var option = {
        title:{
            text:'无锡不同美食分类下的餐厅的平均价格和餐厅的平均评分比较',
            textStyle:{
                fontFamily:'楷体',
                fontSize:21
            }
        },
        xAxis:{
            type:'category',
            data:cate_name,
            axisLabel:{
                // interval 0 with rotated labels so every category is labelled.
                interval:0,
                rotate:30,
                margin:10
            }
        },
        // Two value axes: index 0 for prices (0-100), index 1 for scores (0-5).
        yAxis: [
            {
                type:'value',
                scale:true,
                max:100,
                min:0
            },
            {
                type:'value',
                scale:true,
                max: 5.0,
                min: 0
            }
        ],
        grid:{
            width:1600,
            left:50
        },
        // NOTE(review): ECharts documents `data`, not `name`, as the legend
        // entry list - confirm whether this key has any effect.
        legend:{
            name:['平均价格','平均评分'],
            top:10
        },
        tooltip:{
            trigger:'item',
            triggerOn:'mousemove',
            // Series 0 is the price series (unit 元); the other one is the
            // score series (unit 分).
            formatter:function (arg) {
                if (arg.seriesIndex == 0) {
                    return '美食类别:'+arg.name+'<br>'+arg.seriesName+':'+arg.value+'元'
                }else{
                    return '美食类别:'+arg.name+'<br>'+arg.seriesName+':'+arg.value+'分'
                }
            }
        },
        series:[
            {
                type:'bar',
                name:'平均价格',
                data:avg_price,
                yAxisIndex:0,
                label:{
                    show:true,
                    rotate:40,
                    distance:10,
                    position:'top'
                }
            },
            {
                type:'bar',
                name:'平均评分',
                data:avg_score,
                yAxisIndex:1,
                label:{
                    show:true,
                    rotate:40,
                    distance:10,
                    position:'top'
                }
            }
        ]
    }
    myCharts.setOption(option)
</script>
</body>
</html>
结论:
小吃快餐、汤/粥/炖菜、蛋糕甜点这几类的价格普遍较低,而东南亚菜、日韩料理、自助餐的价格普遍较高;京菜鲁菜、西北菜的评分普遍较低,而东南亚菜、素食的评分普遍较高。
drawPie.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>不同美食分类的餐厅数量占比图</title>
    <script src="../static/js/echarts.min.js"></script>
    <script src="../static/theme/vintage.js"></script>
</head>
<body>
<!-- Container the pie chart is drawn into. -->
<div class="chart" style="width: 1000px;height: 600px;margin: auto"></div>
<script>
    // Create the chart with the 'vintage' theme.
    var myCharts = echarts.init(document.querySelector('.chart'),'vintage')
    // {name, value} entries injected by the Flask view: one per food
    // category, value being the number of restaurants in it.
    var data = {{ data|tojson }}
    var option = {
        title:{
            text:'不同美食分类的餐厅数量占比',
            textStyle:{
                fontFamily:'楷体',
                fontSize:21
            },
            top:10,
            left:10
        },
        legend:[
            {
                name:['餐厅数量'],
                right:10,
                bottom:160,
                orient:'vertical',
                height:320
            },
        ],
        tooltip:{
            trigger:'item',
            triggerOn:'mousemove',
            // Each slice is a food category and its value a restaurant
            // count, so the tooltip is labelled accordingly (the labels were
            // previously those of the top-10 comments chart).
            formatter:function (arg) {
                return '美食类别:'+arg.name+'<br>'+'餐厅数量:'+arg.value+'<br>'+'数量占比:'+arg.percent+"%"
            }
        },
        series:[
            {
                name:'餐厅数量',
                type:'pie',
                data:data,
                label:{
                    show:true
                },
                selectedMode:'multiple',
                selectedOffset:20,
                // Inner and outer radius: drawn as a ring.
                radius:['50%','70%'],
                center:['40%','50%']
            }
        ]
    }
    myCharts.setOption(option)
</script>
</body>
</html>
结论:
蛋糕甜点、小吃快餐、火锅这三类餐厅的数量合计约占全部餐厅的一半。蛋糕甜点和小吃快餐能够满足人们快节奏生活的需求,而火锅店依然十分受欢迎,是周末与朋友聚餐的常选场所。
drawSignalBar.html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>无锡评论最多的前十家餐厅柱状图</title>
    <script src="../static/js/echarts.min.js"></script>
    <script src="../static/theme/vintage.js"></script>
</head>
<body>
<!-- Container the bar chart is drawn into. -->
<div class="chart" style="width: 1000px;height: 600px;margin: auto"></div>
<script>
    // Declared with `var` like in the other chart pages; the bare assignment
    // created an implicit global.
    var myCharts = echarts.init(document.querySelector('.chart'),'vintage')
    // Parallel arrays injected by the Flask view: restaurant names (long
    // names already contain a line break) and their comment counts.
    var food_name = {{ new_food_name|tojson }}
    var comment = {{ comment|tojson }}
    var option = {
        title:{
            text:'无锡评论最多的前十家餐厅',
            textStyle:{
                fontFamily:'楷体',
                fontSize:21
            }
        },
        grid:{
            width:800,
            height:460
        },
        legend:{
            name:['评论数量']
        },
        tooltip:{
            trigger:'item',
            triggerOn:'mousemove',
            // Show the restaurant name and its number of comments.
            formatter:function (arg) {
                return '餐厅名称:'+arg.name+'<br>'+'评论数量:'+arg.value
            }
        },
        xAxis:{
            type:'category',
            data:food_name,
            axisLabel:{
                // interval 0 with rotated labels so every restaurant is labelled.
                interval:0,
                rotate:20,
                margin:20,
                lineHeight:16,
            },
        },
        yAxis:{
            type:'value',
            scale:true
        },
        series:[
            {
                name:'评论数量',
                type:'bar',
                data: comment,
                label:{
                    show:true,
                    position:'top',
                    rotate: 30,
                    distance:10
                },
                barWidth:50
            }
        ]
    }
    myCharts.setOption(option)
</script>
</body>
</html>
结论:
评论数量前十的餐厅中有四家是自助餐,同时评论数量最多的餐厅也是自助餐。由此可见自助餐确实很受欢迎,也能够满足人们对于美食多样化的需求。
以下是项目源码,希望能够帮助你们,如有疑问,下方评论
flask项目代码链接
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。