赞
踩
【开源】本项目基于 Python + pandas + Flask + MySQL 等技术,实现豆瓣电影数据的获取及可视化分析展示。觉得有用的朋友可以来个一键三连,感谢!
【开源】2024 最新 Python 豆瓣电影数据爬虫 + 可视化分析项目
https://github.com/mudfish/python-douban-view
"""Asynchronous, concurrent crawler for Douban movie data.

For every movie type in ``MOVIE_TYPES`` the crawler walks the listing API
page by page, opens each movie's detail page, extracts the fields named in
``CSV_HEADS`` and appends them to ``CSV_NAME``.  When crawling is finished
the CSV is de-duplicated and appended to the ``tb_movie`` table.

The last completed page of each movie type is stored in
``PAGE_PROGRESS_FILE`` so that a later run can resume where this one stopped.
"""

# NOTE: the published excerpt did not include its import lines; these are the
# modules the code below actually references.
import asyncio
import csv
import json
import os
import random
import re
import time
import traceback

import aiohttp
import pandas as pd
from lxml import etree
from sqlalchemy import create_engine, text
from tqdm import tqdm

# Highest page number fetched per movie type in one run.
MAX_PAGES = 5
# Entries requested per listing page.  The same value is used as the offset
# stride so that consecutive pages neither overlap nor leave gaps (the
# original code stepped by 20 but only requested 10 entries).
PAGE_SIZE = 20
# File that stores the resume position of every movie type.
PAGE_PROGRESS_FILE = "page_progress.json"
# Movie types (listing tags) to crawl.
MOVIE_TYPES = ["剧情", "喜剧", "动作", "爱情", "科幻", "动画"]
# Name of the intermediate CSV file.
CSV_NAME = "movie_data.csv"
# CSV header; ``Spider.process_movie`` builds its rows in this order.
CSV_HEADS = [
    "id",
    "movie_id",
    "title",
    "year",
    "directors",
    "casts",
    "rating",
    "cover",
    "country",
    "summary",
    "types",
    "lang",
    "release_date",
    "time",
    "url",
]
# Pattern used on the release date: drops everything except digits and "-".
RELEASE_DATE_REMOVE_RE = r"[^0-9-]"

# The connection URL can be overridden through the environment so that real
# credentials do not have to live in the source; the default is the value
# that used to be hard-coded.
DB_URL = os.environ.get(
    "DOUBAN_DB_URL", "mysql+pymysql://root:123456@127.0.0.1:3306/db_douban"
)
engine = create_engine(DB_URL)


def get_id():
    """Return a pseudo-random numeric string used as the row id.

    The value is a random integer in [1, 100000000] followed by the
    fractional digits of the current timestamp.
    """
    return str(random.randint(1, 100000000)) + str(time.time()).split(".")[1].strip()


def _first(nodes, default=""):
    """Return the first item of an XPath result list, or *default* if empty."""
    return nodes[0] if nodes else default


class Spider:
    """Crawls Douban listing and detail pages and stores the result."""

    def __init__(self):
        self.movie_page_url = "https://m.douban.com/rexxar/api/v2/movie/recommend?"
        self.movie_detail_url = "https://movie.douban.com/subject/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Referer": "https://movie.douban.com/explore",
        }
        self.movie_types = MOVIE_TYPES
        # Maps movie type -> next page number to fetch.
        self.page_progress = {}
        # Number of listing pages this run still has to fetch.
        self.total_pages = 0
        self.completed_pages = 0
        self.global_progress_bar = None

    def init(self):
        """Start every run with a fresh CSV file that only holds the header.

        NOTE(review): the CSV is discarded while the page progress is kept,
        so rows of a previous run that stopped before ``clean_csv`` are not
        re-fetched - confirm this is acceptable.
        """
        if os.path.exists(CSV_NAME):
            os.remove(CSV_NAME)
        with open(CSV_NAME, "w", newline="", encoding="utf-8") as writer_f:
            csv.writer(writer_f).writerow(CSV_HEADS)

    def load_page_progress(self):
        """Load the resume positions from ``PAGE_PROGRESS_FILE`` if it exists.

        An existing but empty file is re-initialised with an empty mapping.
        """
        if not os.path.exists(PAGE_PROGRESS_FILE):
            return
        # Check the size before opening, so the file is never rewritten while
        # it is still open for reading.
        if os.stat(PAGE_PROGRESS_FILE).st_size == 0:
            print("初始化页面进度")
            self.page_progress = {}
            self.save_page_progress()
            return
        with open(PAGE_PROGRESS_FILE, "r", encoding="utf-8") as f:
            self.page_progress = json.load(f)

    def save_page_progress(self):
        """Write the current resume positions to ``PAGE_PROGRESS_FILE``."""
        with open(PAGE_PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump(self.page_progress, f, ensure_ascii=False)

    async def get_movie_pages(self, session, type_name):
        """Fetch the remaining listing pages of one movie type.

        Args:
            session: the shared ``aiohttp.ClientSession``.
            type_name: movie type, sent as the ``tags`` query parameter.

        A page that fails is reported and skipped; its progress entry is not
        advanced by that iteration.
        """
        start_page = self.page_progress.get(type_name, 1)
        # The range is empty when the type is already complete.
        for page in range(start_page, MAX_PAGES + 1):
            params = {
                "start": (page - 1) * PAGE_SIZE,
                "count": PAGE_SIZE,
                "tags": type_name,
            }
            try:
                async with session.get(
                    self.movie_page_url, headers=self.headers, params=params
                ) as resp:
                    resp.raise_for_status()
                    payload = await resp.json()
                for movie in payload["items"]:
                    # The listing contains entries of other kinds as well.
                    if movie.get("type") == "movie":
                        await self.process_movie(session, movie)
                # Record the progress only after the whole page succeeded.
                self.page_progress[type_name] = page + 1
                self.save_page_progress()
                self.update_global_progress()
            except Exception as e:
                print(f"处理:{type_name}第{page}页失败: {e}")
                traceback.print_exc()
                continue

    async def process_movie(self, session, movie):
        """Fetch one movie's detail page and append its row to the CSV.

        Args:
            session: the shared ``aiohttp.ClientSession``.
            movie: listing entry; ``id``, ``title`` and ``year`` are read.

        Fields that are missing from the detail page are stored as empty
        strings, the same way the runtime was already handled, so one
        incomplete page no longer aborts the whole listing page.  HTTP errors
        still propagate to the caller.
        """
        detail_url = self.movie_detail_url.format(movie["id"])
        async with session.get(detail_url, headers=self.headers) as resp:
            resp.raise_for_status()
            html_text = await resp.text()

        tree = etree.HTML(html_text)
        country = _first(
            tree.xpath(
                '//span[contains(text(),"制片国家")]/following-sibling::br[1]/preceding-sibling::text()[1]'
            )
        ).replace(" / ", ",")
        language = _first(
            tree.xpath(
                '//span[contains(text(),"语言")]/following-sibling::br[1]/preceding-sibling::text()[1]'
            )
        )
        # Keep the first 10 characters of the release date, then strip
        # everything that is not a digit or "-".
        release_date = re.sub(
            RELEASE_DATE_REMOVE_RE,
            "",
            _first(tree.xpath('//span[@property="v:initialReleaseDate"]/text()'))[:10],
        )

        # Same order as CSV_HEADS.
        movie_data = [
            get_id(),
            movie["id"],
            movie["title"],
            movie["year"],
            ",".join(tree.xpath('//a[@rel="v:directedBy"]/text()')),
            ",".join(tree.xpath('//a[@rel="v:starring"]/text()')),
            _first(tree.xpath('//strong[@property="v:average"]/text()')),
            _first(tree.xpath('//img[@rel="v:image"]/@src')),
            country,
            _first(tree.xpath('//span[@property="v:summary"]/text()')).strip(),
            ",".join(tree.xpath('//div[@id="info"]/span[@property="v:genre"]/text()')),
            language,
            release_date,
            _first(tree.xpath('//span[@property="v:runtime"]/text()')),
            detail_url,
        ]
        self.save_to_csv(movie_data)

    def save_to_csv(self, row):
        """Append one row to ``CSV_NAME``."""
        with open(CSV_NAME, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(row)

    def clean_csv(self):
        """De-duplicate the CSV, append it to ``tb_movie`` and purge duplicates.

        After the insert, for every ``movie_id`` that occurs more than once
        in the table only the row with the smallest ``id`` is kept.
        """
        print("===========清理数据============")
        df = pd.read_csv(CSV_NAME, encoding="utf-8")
        df.drop_duplicates(subset=["movie_id"], keep="first", inplace=True)
        print("存储到数据库...")
        df.to_sql("tb_movie", con=engine, index=False, if_exists="append")
        print("清理重复数据...")
        # engine.begin() commits on success and always returns the connection
        # to the pool; a bare engine.connect().execute() leaks the connection
        # and, on SQLAlchemy 2.x, never commits the DELETE.
        with engine.begin() as conn:
            conn.execute(
                text(
                    "delete t1 from tb_movie t1 inner join (select min(id) as id,movie_id from tb_movie group by movie_id having count(*) > 1) t2 on t1.movie_id=t2.movie_id where t1.id>t2.id"
                )
            )

    def update_global_progress(self):
        """Count one more finished listing page and refresh the progress bar."""
        self.completed_pages += 1
        if self.global_progress_bar is not None:
            self.global_progress_bar.update(1)
            self.global_progress_bar.refresh()

    async def run(self):
        """Crawl all movie types concurrently, then store the result."""
        self.init()
        self.load_page_progress()
        # A type whose next page equals MAX_PAGES still has one page left, so
        # count every type with next page <= MAX_PAGES.
        self.total_pages = sum(
            max(0, MAX_PAGES + 1 - self.page_progress.get(type_name, 1))
            for type_name in self.movie_types
        )
        print(self.total_pages)
        if self.total_pages == 0:
            # Every type is already complete; there is nothing to fetch.
            return

        self.global_progress_bar = tqdm(
            total=self.total_pages, desc="progress", unit="page", colour="GREEN"
        )
        try:
            async with aiohttp.ClientSession() as session:
                tasks = [
                    self.get_movie_pages(session, type_name)
                    for type_name in self.movie_types
                ]
                await asyncio.gather(*tasks)
        finally:
            self.global_progress_bar.close()
        self.clean_csv()


if __name__ == "__main__":
    # asyncio.run creates and closes the event loop itself;
    # asyncio.get_event_loop() is deprecated outside of a running loop.
    asyncio.run(Spider().run())
"""Flask web application: login/registration plus the movie dashboard."""

import os

from flask import Flask, redirect, render_template, request, session, url_for

from utils import db_query

app = Flask(__name__)
# Key used to sign the session cookie.  It can be supplied through the
# environment so a deployment does not have to use the value committed to the
# source; the default is the value that used to be hard-coded.
app.secret_key = os.environ.get("FLASK_SECRET_KEY", "mysessionkey")

# Paths that may be requested without being logged in.
PUBLIC_PATHS = ("/login", "/logout", "/register")


@app.before_request
def before_request():
    """Redirect every request without a logged-in session to the login page.

    Requests whose path starts with ``/static`` and the paths listed in
    ``PUBLIC_PATHS`` are let through unchanged.
    """
    if request.path.startswith("/static") or request.path in PUBLIC_PATHS:
        return None
    if not session.get("login_username"):
        return redirect(url_for("login"))
    return None


@app.route("/")
def index():
    """Render the dashboard with the three statistics queries."""
    return render_template(
        "index.html",
        login_username=session.get("login_username"),
        movie_stats=db_query.fetch_movie_statistics(),
        movie_type_distribution=db_query.fetch_movie_type_distribution(),
        movie_rating_distribution=db_query.fetch_movie_rating_distribution(),
    )


@app.route("/login", methods=["GET", "POST"])
def login():
    """Show the login form (GET/HEAD) or check the submitted credentials (POST)."""
    # Everything that is not a POST gets the form.  Flask adds HEAD to every
    # GET route, and the former ``elif request.method == "GET"`` returned
    # None for it, which is an invalid response.
    if request.method != "POST":
        return render_template("login.html")

    # .get() instead of indexing: a missing field must not raise KeyError.
    username = request.form.get("username", "")
    password = request.form.get("password", "")
    if not username or not password:
        return render_template("error.html", error="用户名或密码错误")

    # SECURITY NOTE(review): passwords are stored and compared in plain text.
    # Moving to salted hashes needs a migration of the existing tb_user rows,
    # so it is flagged here rather than changed.
    sql = "SELECT * FROM `tb_user` WHERE `username` = %s AND `password` = %s"
    if len(db_query.query(sql, (username, password))) > 0:
        session["login_username"] = username
        return redirect(url_for("index"))
    return render_template("error.html", error="用户名或密码错误")


@app.route("/logout")
def logout():
    """Drop the login from the session and go back to the start page."""
    session.pop("login_username", None)
    return redirect(url_for("index"))


@app.route("/register", methods=["GET", "POST"])
def register():
    """Show the registration form (GET/HEAD) or create a new user (POST)."""
    if request.method != "POST":
        return render_template("register.html")

    username = request.form.get("username", "")
    password = request.form.get("password", "")
    password_confirm = request.form.get("password_confirm", "")

    # Two empty passwords are equal, so emptiness has to be rejected
    # explicitly before the confirmation is compared.
    if not username or not password:
        return render_template("error.html", error="用户名和密码不能为空")
    if password != password_confirm:
        return render_template("error.html", error="两次密码输入不一致")

    # Refuse a user name that is already taken.
    existing = db_query.query(
        "SELECT * FROM `tb_user` WHERE `username` = %s", (username,)
    )
    if len(existing) > 0:
        return render_template("error.html", error="用户名已存在")

    # SECURITY NOTE(review): the password is stored in plain text, see login().
    db_query.query(
        "INSERT INTO `tb_user` (`username`, `password`) VALUES (%s, %s)",
        (username, password),
        db_query.QueryType.NO_SELECT,
    )
    return redirect(url_for("login"))


@app.route("/list")
def movie_list():
    """Render the movie list page with the result of ``fetch_movie_list``."""
    movies = db_query.fetch_movie_list()
    return render_template(
        "list.html", login_username=session.get("login_username"), movies=movies
    )


@app.errorhandler(404)
def page_not_found(error):
    """Render the custom 404 page."""
    return render_template("404.html"), 404


@app.errorhandler(500)
def system_error(error):
    """Render the custom 500 page."""
    return render_template("500.html"), 500


if __name__ == "__main__":
    # Reload Jinja templates automatically when they change.
    app.jinja_env.auto_reload = True
    # Development server only: debug=True enables the interactive debugger.
    app.run(host="127.0.0.1", port=8002, debug=True)
<!DOCTYPE html>
<html lang="zh-CN">
  <head>
    {#
      Dashboard page (Jinja2 template).

      Context variables used below:
        login_username            - shown in the top bar
        movie_stats               - mapping read with the keys total_movies,
                                    highest_rating, most_popular_cast and
                                    most_common_country
        movie_type_distribution   - passed unchanged as the `data` of the
                                    ECharts pie series
        movie_rating_distribution - sequence of items; item[0] becomes the
                                    x-axis category, item[1] the line value
    #}
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1, shrink-to-fit=no"
    />
    <meta name="description" content="" />
    <meta name="author" content="" />
    <title>首页</title>

    <!-- Custom fonts for this template -->
    <link
      href="/static/vendor/fontawesome-free/css/all.min.css"
      rel="stylesheet"
      type="text/css"
    />
    <link
      href="https://fonts.googleapis.com/css?family=Nunito:200,200i,300,300i,400,400i,600,600i,700,700i,800,800i,900,900i"
      rel="stylesheet"
    />

    <!-- Custom styles for this template -->
    <link href="/static/css/sb-admin-2.min.css" rel="stylesheet" />
  </head>

  <body id="page-top">
    <!-- Page Wrapper -->
    <div id="wrapper">
      <!-- Sidebar -->
      <ul
        class="navbar-nav bg-gradient-primary sidebar sidebar-dark accordion"
        id="accordionSidebar"
      >
        <!-- Sidebar brand; links to the dashboard route "/" (the application
             has no "index.html" route) -->
        <a
          class="sidebar-brand d-flex align-items-center justify-content-center"
          href="/"
        >
          <div class="sidebar-brand-icon rotate-n-15">
            <i class="fas fa-laugh-wink"></i>
          </div>
          <div class="sidebar-brand-text mx-3">豆瓣电影可视化</div>
        </a>

        <!-- Divider -->
        <hr class="sidebar-divider my-0" />

        <!-- Nav item: dashboard -->
        <li class="nav-item active">
          <a class="nav-link" href="/">
            <i class="fas fa-fw fa-tachometer-alt"></i>
            <span>首页</span></a
          >
        </li>

        <!-- Nav item: movie list -->
        <li class="nav-item">
          <a class="nav-link" href="/list">
            <i class="fas fa-fw fa-table"></i>
            <span>电影列表</span></a
          >
        </li>

        <!-- Divider -->
        <hr class="sidebar-divider d-none d-md-block" />

        <!-- Sidebar toggler -->
        <div class="text-center d-none d-md-inline">
          <button class="rounded-circle border-0" id="sidebarToggle"></button>
        </div>
      </ul>
      <!-- End of Sidebar -->

      <!-- Content Wrapper -->
      <div id="content-wrapper" class="d-flex flex-column">
        <!-- Main Content -->
        <div id="content">
          <!-- Topbar -->
          <nav
            class="navbar navbar-expand navbar-light bg-white topbar mb-4 static-top shadow"
          >
            <!-- Sidebar toggle (topbar) -->
            <button
              id="sidebarToggleTop"
              class="btn btn-link d-md-none rounded-circle mr-3"
            >
              <i class="fa fa-bars"></i>
            </button>

            <!-- Topbar navbar -->
            <ul class="navbar-nav ml-auto">
              <div class="topbar-divider d-none d-sm-block"></div>

              <!-- Nav item: user information -->
              <li class="nav-item dropdown no-arrow">
                <a
                  class="nav-link dropdown-toggle"
                  href="#"
                  id="userDropdown"
                  role="button"
                  data-toggle="dropdown"
                  aria-haspopup="true"
                  aria-expanded="false"
                >
                  <span class="mr-2 d-none d-lg-inline text-gray-600 small"
                    >{{login_username}}</span
                  >
                  <img
                    class="img-profile rounded-circle"
                    src="/static/img/avatar.png"
                  />
                </a>
                <!-- Dropdown: user information -->
                <div
                  class="dropdown-menu dropdown-menu-right shadow animated--grow-in"
                  aria-labelledby="userDropdown"
                >
                  <a
                    class="dropdown-item"
                    href="#"
                    data-toggle="modal"
                    data-target="#logoutModal"
                  >
                    <i
                      class="fas fa-sign-out-alt fa-sm fa-fw mr-2 text-gray-400"
                    ></i>
                    Logout
                  </a>
                </div>
              </li>
            </ul>
          </nav>
          <!-- End of Topbar -->

          <!-- Begin Page Content -->
          <div class="container-fluid">
            <!-- Row of summary cards -->
            <div class="row">
              <!-- Card: total number of movies -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-primary shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-primary text-uppercase mb-1"
                        >
                          电影总数
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats['total_movies'] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-calendar fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>

              <!-- Card: highest rating -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-success shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-success text-uppercase mb-1"
                        >
                          电影最高评分
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats['highest_rating'] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-dollar-sign fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>

              <!-- Card: most frequent cast member -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-info shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-info text-uppercase mb-1"
                        >
                          出演最多演员
                        </div>
                        <div class="row no-gutters align-items-center">
                          <div class="col-auto">
                            <div
                              class="h5 mb-0 mr-3 font-weight-bold text-gray-800"
                            >
                              {{ movie_stats['most_popular_cast'] }}
                            </div>
                          </div>
                          <div class="col">
                            <!-- Static bar fixed at 50%; not bound to data -->
                            <div class="progress progress-sm mr-2">
                              <div
                                class="progress-bar bg-info"
                                role="progressbar"
                                style="width: 50%"
                                aria-valuenow="50"
                                aria-valuemin="0"
                                aria-valuemax="100"
                              ></div>
                            </div>
                          </div>
                        </div>
                      </div>
                      <div class="col-auto">
                        <i
                          class="fas fa-clipboard-list fa-2x text-gray-300"
                        ></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>

              <!-- Card: most frequent production country -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-warning shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div
                          class="font-weight-bold text-warning text-uppercase mb-1"
                        >
                          制片最多国家
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats['most_common_country'] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-comments fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
            </div>

            <!-- Row of charts -->
            <div class="row">
              <!-- Pie chart: movies per type -->
              <div class="col-xl-6 col-lg-6">
                <div class="card shadow mb-4">
                  <div
                    class="card-header py-3 d-flex flex-row align-items-center justify-content-between"
                  >
                    <h6 class="m-0 font-weight-bold text-primary">
                      电影分类统计
                    </h6>
                  </div>
                  <div class="card-body">
                    <div
                      id="movie_type_chart"
                      style="width: 100%; height: 450px"
                    ></div>
                  </div>
                </div>
              </div>

              <!-- Line chart: movies per rating -->
              <div class="col-xl-6 col-lg-6">
                <div class="card shadow mb-4">
                  <div
                    class="card-header py-3 d-flex flex-row align-items-center justify-content-between"
                  >
                    <h6 class="m-0 font-weight-bold text-primary">
                      电影评分统计
                    </h6>
                  </div>
                  <div class="card-body">
                    <div
                      id="movie_score_chart"
                      style="width: 100%; height: 450px"
                    ></div>
                  </div>
                </div>
              </div>
            </div>
          </div>
          <!-- /.container-fluid -->
        </div>
        <!-- End of Main Content -->

        <!-- Footer -->
        <footer class="sticky-footer bg-white">
          <div class="container my-auto">
            <div class="copyright text-center my-auto">
              <span
                >@Laoxu Open Source.<a
                  target="_blank"
                  href="https://github.com/mudfish"
                  >Github</a
                ></span
              >
            </div>
          </div>
        </footer>
        <!-- End of Footer -->
      </div>
      <!-- End of Content Wrapper -->
    </div>
    <!-- End of Page Wrapper -->

    <!-- Scroll to Top Button -->
    <a class="scroll-to-top rounded" href="#page-top">
      <i class="fas fa-angle-up"></i>
    </a>

    <!-- Logout Modal -->
    <div
      class="modal fade"
      id="logoutModal"
      tabindex="-1"
      role="dialog"
      aria-labelledby="exampleModalLabel"
      aria-hidden="true"
    >
      <div class="modal-dialog" role="document">
        <div class="modal-content">
          <div class="modal-header">
            <h5 class="modal-title" id="exampleModalLabel">Ready to Leave?</h5>
            <button
              class="close"
              type="button"
              data-dismiss="modal"
              aria-label="Close"
            >
              <span aria-hidden="true">×</span>
            </button>
          </div>
          <div class="modal-footer">
            <button
              class="btn btn-secondary"
              type="button"
              data-dismiss="modal"
            >
              Cancel
            </button>
            <a class="btn btn-primary" href="/logout">Logout</a>
          </div>
        </div>
      </div>
    </div>

    <!-- Bootstrap core JavaScript -->
    <script src="/static/vendor/jquery/jquery.min.js"></script>
    <script src="/static/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>

    <!-- Core plugin JavaScript -->
    <script src="/static/vendor/jquery-easing/jquery.easing.min.js"></script>

    <!-- Custom scripts for all pages -->
    <script src="/static/js/sb-admin-2.min.js"></script>

    <!-- Charts are drawn with ECharts only. The Chart.js library and its two
         demo scripts were removed: this page has none of the canvas elements
         they draw into. -->
    <script src="/static/js/echarts.min.js"></script>
    <script>
      // One function scope for both charts, so the chart variables are not
      // globals shared between script blocks.
      (function () {
        // Pie chart: movies per type.
        var movieTypeData = {{ movie_type_distribution|tojson }} || [];
        var typeChart = echarts.init(
          document.getElementById("movie_type_chart")
        );
        typeChart.setOption({
          title: {
            text: "",
            subtext: "来源:豆瓣数据",
            left: "center",
          },
          tooltip: {
            trigger: "item",
          },
          legend: {
            orient: "vertical",
            left: "left",
          },
          series: [
            {
              // Shown in the tooltip; replaces the sample label "Access From".
              name: "电影类型",
              type: "pie",
              radius: "50%",
              data: movieTypeData,
              emphasis: {
                itemStyle: {
                  shadowBlur: 10,
                  shadowOffsetX: 0,
                  shadowColor: "rgba(0, 0, 0, 0.5)",
                },
              },
            },
          ],
        });

        // Line chart: movies per rating.
        var ratingData = {{ movie_rating_distribution|tojson }} || [];
        var ratingChart = echarts.init(
          document.getElementById("movie_score_chart")
        );
        ratingChart.setOption({
          title: {
            text: "",
            subtext: "来源:豆瓣数据",
            left: "center",
          },
          xAxis: {
            type: "category",
            boundaryGap: false,
            data: ratingData.map(item => item[0]),
          },
          yAxis: {
            type: "value",
          },
          series: [
            {
              data: ratingData.map(item => item[1]),
              type: "line",
              areaStyle: {},
            },
          ],
          tooltip: {
            // Triggered by the axis, as usual for charts with a category axis.
            trigger: "axis",
            axisPointer: {
              // Pointer style: "line" or "shadow".
              type: "shadow",
            },
          },
        });

        // ECharts does not follow container size changes on its own.
        window.addEventListener("resize", function () {
          typeChart.resize();
          ratingChart.resize();
        });
      })();
    </script>
  </body>
</html>
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。